GCC Code Coverage Report


Directory: ./
File: Objects/unicodeobject.c
Date: 2022-06-27 09:02:04
Exec Total Coverage
Lines: 3194 6662 47.9%
Functions: 234 314 74.5%
Branches: 1782 4494 39.7%

Line Branch Exec Source
1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9 Copyright (c) Corporation for National Research Initiatives.
10
11 --------------------------------------------------------------------
12 The original string type implementation is:
13
14 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
16
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38
39 */
40
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "pycore_abstract.h" // _PyIndex_Check()
44 #include "pycore_atomic_funcs.h" // _Py_atomic_size_get()
45 #include "pycore_bytesobject.h" // _PyBytes_Repeat()
46 #include "pycore_bytes_methods.h" // _Py_bytes_lower()
47 #include "pycore_format.h" // F_LJUST
48 #include "pycore_initconfig.h" // _PyStatus_OK()
49 #include "pycore_interp.h" // PyInterpreterState.fs_codec
50 #include "pycore_long.h" // _PyLong_FormatWriter()
51 #include "pycore_object.h" // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
52 #include "pycore_pathconfig.h" // _Py_DumpPathConfig()
53 #include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
54 #include "pycore_pystate.h" // _PyInterpreterState_GET()
55 #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
56 #include "pycore_unicodeobject.h" // struct _Py_unicode_state
57 #include "stringlib/eq.h" // unicode_eq()
58
59 #ifdef MS_WINDOWS
60 #include <windows.h>
61 #endif
62
63 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
64 # include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
65 #endif
66
67 /* Uncomment to display statistics on interned strings at exit
68 in _PyUnicode_ClearInterned(). */
69 /* #define INTERNED_STATS 1 */
70
71
72 /*[clinic input]
73 class str "PyObject *" "&PyUnicode_Type"
74 [clinic start generated code]*/
75 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
76
77 /*[python input]
78 class Py_UCS4_converter(CConverter):
79 type = 'Py_UCS4'
80 converter = 'convert_uc'
81
82 def converter_init(self):
83 if self.default is not unspecified:
84 self.c_default = ascii(self.default)
85 if len(self.c_default) > 4 or self.c_default[0] != "'":
86 self.c_default = hex(ord(self.default))
87
88 [python start generated code]*/
89 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
90
91 /* --- Globals ------------------------------------------------------------
92
93 NOTE: In the interpreter's initialization phase, some globals are currently
94 initialized dynamically as needed. In the process Unicode objects may
95 be created before the Unicode type is ready.
96
97 */
98
99
100 #ifdef __cplusplus
101 extern "C" {
102 #endif
103
104 // Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
105 // The value must be the same in fileutils.c.
106 #define MAX_UNICODE 0x10ffff
107
108 #ifdef Py_DEBUG
109 # define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
110 #else
111 # define _PyUnicode_CHECK(op) PyUnicode_Check(op)
112 #endif
113
114 #define _PyUnicode_UTF8(op) \
115 (_PyCompactUnicodeObject_CAST(op)->utf8)
116 #define PyUnicode_UTF8(op) \
117 (assert(_PyUnicode_CHECK(op)), \
118 PyUnicode_IS_COMPACT_ASCII(op) ? \
119 ((char*)(_PyASCIIObject_CAST(op) + 1)) : \
120 _PyUnicode_UTF8(op))
121 #define _PyUnicode_UTF8_LENGTH(op) \
122 (_PyCompactUnicodeObject_CAST(op)->utf8_length)
123 #define PyUnicode_UTF8_LENGTH(op) \
124 (assert(_PyUnicode_CHECK(op)), \
125 PyUnicode_IS_COMPACT_ASCII(op) ? \
126 _PyASCIIObject_CAST(op)->length : \
127 _PyUnicode_UTF8_LENGTH(op))
128
129 #define _PyUnicode_LENGTH(op) \
130 (_PyASCIIObject_CAST(op)->length)
131 #define _PyUnicode_STATE(op) \
132 (_PyASCIIObject_CAST(op)->state)
133 #define _PyUnicode_HASH(op) \
134 (_PyASCIIObject_CAST(op)->hash)
135 #define _PyUnicode_KIND(op) \
136 (assert(_PyUnicode_CHECK(op)), \
137 _PyASCIIObject_CAST(op)->state.kind)
138 #define _PyUnicode_GET_LENGTH(op) \
139 (assert(_PyUnicode_CHECK(op)), \
140 _PyASCIIObject_CAST(op)->length)
141 #define _PyUnicode_DATA_ANY(op) \
142 (_PyUnicodeObject_CAST(op)->data.any)
143
144 #define _PyUnicode_SHARE_UTF8(op) \
145 (assert(_PyUnicode_CHECK(op)), \
146 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
147 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
148
149 /* true if the Unicode object has an allocated UTF-8 memory block
150 (not shared with other data) */
151 #define _PyUnicode_HAS_UTF8_MEMORY(op) \
152 ((!PyUnicode_IS_COMPACT_ASCII(op) \
153 && _PyUnicode_UTF8(op) \
154 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
155
156 /* Generic helper macro to convert characters of different types.
157 from_type and to_type have to be valid type names, begin and end
158 are pointers to the source characters which should be of type
159 "from_type *". to is a pointer of type "to_type *" and points to the
160 buffer where the result characters are written to. */
161 #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
162 do { \
163 to_type *_to = (to_type *)(to); \
164 const from_type *_iter = (const from_type *)(begin);\
165 const from_type *_end = (const from_type *)(end);\
166 Py_ssize_t n = (_end) - (_iter); \
167 const from_type *_unrolled_end = \
168 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \
169 while (_iter < (_unrolled_end)) { \
170 _to[0] = (to_type) _iter[0]; \
171 _to[1] = (to_type) _iter[1]; \
172 _to[2] = (to_type) _iter[2]; \
173 _to[3] = (to_type) _iter[3]; \
174 _iter += 4; _to += 4; \
175 } \
176 while (_iter < (_end)) \
177 *_to++ = (to_type) *_iter++; \
178 } while (0)
179
/* Yield the statically allocated one-character string object for the
   Latin-1 code point `ch` (expected range 0..255) as a `PyObject *`.
   Code points below 128 index the `ascii` singleton table; the others
   index the `latin1` table, which starts at code point 128.  No reference
   count is touched by the macro itself.

   `ch` is parenthesized at every use so that an expression argument such
   as `c & 0xff` groups as intended.  It is still evaluated more than once,
   so it must be free of side effects. */
#define LATIN1(ch)  \
    ((ch) < 128 \
     ? (PyObject*)&_Py_SINGLETON(strings).ascii[(ch)] \
     : (PyObject*)&_Py_SINGLETON(strings).latin1[(ch) - 128])
184
185 #ifdef MS_WINDOWS
186 /* On Windows, overallocate by 50% is the best factor */
187 # define OVERALLOCATE_FACTOR 2
188 #else
189 /* On Linux, overallocate by 25% is the best factor */
190 # define OVERALLOCATE_FACTOR 4
191 #endif
192
193 /* This dictionary holds all interned unicode strings. Note that references
194 to strings in this dictionary are *not* counted in the string's ob_refcnt.
195 When the interned string reaches a refcnt of 0 the string deallocation
196 function will delete the reference from this dictionary.
197
198 Another way to look at this is to say that the actual reference
199 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
200 */
201 static PyObject *interned = NULL;
202
203 /* Forward declaration */
204 static inline int
205 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
206 static inline void
207 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
208 static PyObject *
209 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
210 const char *errors);
211 static PyObject *
212 unicode_decode_utf8(const char *s, Py_ssize_t size,
213 _Py_error_handler error_handler, const char *errors,
214 Py_ssize_t *consumed);
215 #ifdef Py_DEBUG
216 static inline int unicode_is_finalizing(void);
217 static int unicode_is_singleton(PyObject *unicode);
218 #endif
219
220
221 // Return a borrowed reference to the empty string singleton.
222 57584004 static inline PyObject* unicode_get_empty(void)
223 {
224 _Py_DECLARE_STR(empty, "");
225 57584004 return &_Py_STR(empty);
226 }
227
228
229 // Return a strong reference to the empty string singleton.
230 5250854 static inline PyObject* unicode_new_empty(void)
231 {
232 5250854 PyObject *empty = unicode_get_empty();
233 5250854 Py_INCREF(empty);
234 5250854 return empty;
235 }
236
237 #define _Py_RETURN_UNICODE_EMPTY() \
238 do { \
239 return unicode_new_empty(); \
240 } while (0)
241
242 static inline void
243 250415 unicode_fill(int kind, void *data, Py_UCS4 value,
244 Py_ssize_t start, Py_ssize_t length)
245 {
246 assert(0 <= start);
247
1/4
✓ Branch 0 taken 250415 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
250415 switch (kind) {
248 250415 case PyUnicode_1BYTE_KIND: {
249 assert(value <= 0xff);
250 250415 Py_UCS1 ch = (unsigned char)value;
251 250415 Py_UCS1 *to = (Py_UCS1 *)data + start;
252 250415 memset(to, ch, length);
253 250415 break;
254 }
255 case PyUnicode_2BYTE_KIND: {
256 assert(value <= 0xffff);
257 Py_UCS2 ch = (Py_UCS2)value;
258 Py_UCS2 *to = (Py_UCS2 *)data + start;
259 const Py_UCS2 *end = to + length;
260 for (; to < end; ++to) *to = ch;
261 break;
262 }
263 case PyUnicode_4BYTE_KIND: {
264 assert(value <= MAX_UNICODE);
265 Py_UCS4 ch = value;
266 Py_UCS4 * to = (Py_UCS4 *)data + start;
267 const Py_UCS4 *end = to + length;
268 for (; to < end; ++to) *to = ch;
269 break;
270 }
271 default: Py_UNREACHABLE();
272 }
273 250415 }
274
275
/* Fast detection of the most frequent whitespace characters.
   The table has one entry per ASCII code (0..127); an entry is 1 when the
   code is whitespace and 0 otherwise.  Entries not listed are zero. */
const unsigned char _Py_ascii_whitespace[] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
    [0x7F] = 0      /* highest index: keeps the table at 128 entries */
};
306
307 /* forward */
308 static PyObject* get_latin1_char(unsigned char ch);
309 static int unicode_modifiable(PyObject *unicode);
310
311
312 static PyObject *
313 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
314 static PyObject *
315 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
316 static PyObject *
317 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
318
319 static PyObject *
320 unicode_encode_call_errorhandler(const char *errors,
321 PyObject **errorHandler,const char *encoding, const char *reason,
322 PyObject *unicode, PyObject **exceptionObject,
323 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
324
325 static void
326 raise_encode_exception(PyObject **exceptionObject,
327 const char *encoding,
328 PyObject *unicode,
329 Py_ssize_t startpos, Py_ssize_t endpos,
330 const char *reason);
331
/* Same idea for line breaks: one entry per ASCII code (0..127), set to 1
   for the codes that terminate a line.  Entries not listed are zero. */
static const unsigned char ascii_linebreak[] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x7F] = 0      /* highest index: keeps the table at 128 entries */
};
359
360 static int convert_uc(PyObject *obj, void *addr);
361
362 struct encoding_map;
363 #include "clinic/unicodeobject.c.h"
364
365 _Py_error_handler
366 501894 _Py_GetErrorHandler(const char *errors)
367 {
368
3/4
✓ Branch 0 taken 501321 times.
✓ Branch 1 taken 573 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 501321 times.
501894 if (errors == NULL || strcmp(errors, "strict") == 0) {
369 573 return _Py_ERROR_STRICT;
370 }
371
2/2
✓ Branch 0 taken 501320 times.
✓ Branch 1 taken 1 times.
501321 if (strcmp(errors, "surrogateescape") == 0) {
372 501320 return _Py_ERROR_SURROGATEESCAPE;
373 }
374
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (strcmp(errors, "replace") == 0) {
375 return _Py_ERROR_REPLACE;
376 }
377
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (strcmp(errors, "ignore") == 0) {
378 return _Py_ERROR_IGNORE;
379 }
380
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (strcmp(errors, "backslashreplace") == 0) {
381 return _Py_ERROR_BACKSLASHREPLACE;
382 }
383
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (strcmp(errors, "surrogatepass") == 0) {
384 1 return _Py_ERROR_SURROGATEPASS;
385 }
386 if (strcmp(errors, "xmlcharrefreplace") == 0) {
387 return _Py_ERROR_XMLCHARREFREPLACE;
388 }
389 return _Py_ERROR_OTHER;
390 }
391
392
393 static _Py_error_handler
394 1198795 get_error_handler_wide(const wchar_t *errors)
395 {
396
2/4
✓ Branch 0 taken 1198795 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1198795 times.
1198795 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
397 return _Py_ERROR_STRICT;
398 }
399
1/2
✓ Branch 0 taken 1198795 times.
✗ Branch 1 not taken.
1198795 if (wcscmp(errors, L"surrogateescape") == 0) {
400 1198795 return _Py_ERROR_SURROGATEESCAPE;
401 }
402 if (wcscmp(errors, L"replace") == 0) {
403 return _Py_ERROR_REPLACE;
404 }
405 if (wcscmp(errors, L"ignore") == 0) {
406 return _Py_ERROR_IGNORE;
407 }
408 if (wcscmp(errors, L"backslashreplace") == 0) {
409 return _Py_ERROR_BACKSLASHREPLACE;
410 }
411 if (wcscmp(errors, L"surrogatepass") == 0) {
412 return _Py_ERROR_SURROGATEPASS;
413 }
414 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
415 return _Py_ERROR_XMLCHARREFREPLACE;
416 }
417 return _Py_ERROR_OTHER;
418 }
419
420
421 static inline int
422 3873658 unicode_check_encoding_errors(const char *encoding, const char *errors)
423 {
424
3/4
✓ Branch 0 taken 75259 times.
✓ Branch 1 taken 3798399 times.
✓ Branch 2 taken 75259 times.
✗ Branch 3 not taken.
3873658 if (encoding == NULL && errors == NULL) {
425 75259 return 0;
426 }
427
428 3798399 PyInterpreterState *interp = _PyInterpreterState_GET();
429 #ifndef Py_DEBUG
430 /* In release mode, only check in development mode (-X dev) */
431
1/2
✓ Branch 1 taken 3798399 times.
✗ Branch 2 not taken.
3798399 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
432 3798399 return 0;
433 }
434 #else
435 /* Always check in debug mode */
436 #endif
437
438 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
439 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
440 if (!interp->unicode.fs_codec.encoding) {
441 return 0;
442 }
443
444 /* Disable checks during Python finalization. For example, it allows to
445 call _PyObject_Dump() during finalization for debugging purpose. */
446 if (interp->finalizing) {
447 return 0;
448 }
449
450 if (encoding != NULL
451 // Fast path for the most common built-in encodings. Even if the codec
452 // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
453 // create a temporary Unicode string (the key in the cache).
454 && strcmp(encoding, "utf-8") != 0
455 && strcmp(encoding, "utf8") != 0
456 && strcmp(encoding, "ascii") != 0)
457 {
458 PyObject *handler = _PyCodec_Lookup(encoding);
459 if (handler == NULL) {
460 return -1;
461 }
462 Py_DECREF(handler);
463 }
464
465 if (errors != NULL
466 // Fast path for the most common built-in error handlers.
467 && strcmp(errors, "strict") != 0
468 && strcmp(errors, "ignore") != 0
469 && strcmp(errors, "replace") != 0
470 && strcmp(errors, "surrogateescape") != 0
471 && strcmp(errors, "surrogatepass") != 0)
472 {
473 PyObject *handler = PyCodec_LookupError(errors);
474 if (handler == NULL) {
475 return -1;
476 }
477 Py_DECREF(handler);
478 }
479 return 0;
480 }
481
482
/* Check the internal invariants of the string object `op`.
   Each failed invariant is reported through _PyObject_ASSERT_FAILED_MSG()
   with the text of the failing expression.  When `check_content` is
   non-zero the characters themselves are scanned as well (O(n)).
   Returns 1 when the function runs to completion. */
483 int
484 _PyUnicode_CheckConsistency(PyObject *op, int check_content)
485 {
/* CHECK() stringifies its argument, so the expression text below is part
   of the diagnostic output. */
486 #define CHECK(expr) \
487 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
488
489 assert(op != NULL);
490 CHECK(PyUnicode_Check(op));
491
492 PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
493 int kind = ascii->state.kind;
494
/* Compact ASCII strings must use the 1-byte kind. */
495 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
496 CHECK(kind == PyUnicode_1BYTE_KIND);
497 }
498 else {
499 PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
500 void *data;
501
/* Compact non-ASCII: the characters start right after the compact header
   and the utf8 pointer must not point at them. */
502 if (ascii->state.compact == 1) {
503 data = compact + 1;
504 CHECK(kind == PyUnicode_1BYTE_KIND
505 || kind == PyUnicode_2BYTE_KIND
506 || kind == PyUnicode_4BYTE_KIND);
507 CHECK(ascii->state.ascii == 0);
508 CHECK(compact->utf8 != data);
509 }
/* Non-compact: the characters live in the separate data.any buffer. */
510 else {
511 PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
512
513 data = unicode->data.any;
514 CHECK(kind == PyUnicode_1BYTE_KIND
515 || kind == PyUnicode_2BYTE_KIND
516 || kind == PyUnicode_4BYTE_KIND);
517 CHECK(ascii->state.compact == 0);
518 CHECK(data != NULL);
/* For ASCII content the utf8 pointer and length must equal the character
   buffer and its length; otherwise utf8 must be a different pointer. */
519 if (ascii->state.ascii) {
520 CHECK(compact->utf8 == data);
521 CHECK(compact->utf8_length == ascii->length);
522 }
523 else {
524 CHECK(compact->utf8 != data);
525 }
526 }
527
/* A missing utf8 buffer must have a zero length. */
528 if (compact->utf8 == NULL)
529 CHECK(compact->utf8_length == 0);
530 }
531
532 /* check that the best kind is used: O(n) operation */
533 if (check_content) {
534 Py_ssize_t i;
535 Py_UCS4 maxchar = 0;
536 const void *data;
537 Py_UCS4 ch;
538
539 data = PyUnicode_DATA(ascii);
/* Find the largest character in the string. */
540 for (i=0; i < ascii->length; i++)
541 {
542 ch = PyUnicode_READ(kind, data, i);
543 if (ch > maxchar)
544 maxchar = ch;
545 }
/* The largest character must fall in the range that requires this kind. */
546 if (kind == PyUnicode_1BYTE_KIND) {
547 if (ascii->state.ascii == 0) {
548 CHECK(maxchar >= 128);
549 CHECK(maxchar <= 255);
550 }
551 else
552 CHECK(maxchar < 128);
553 }
554 else if (kind == PyUnicode_2BYTE_KIND) {
555 CHECK(maxchar >= 0x100);
556 CHECK(maxchar <= 0xFFFF);
557 }
558 else {
559 CHECK(maxchar >= 0x10000);
560 CHECK(maxchar <= MAX_UNICODE);
561 }
/* The character at index `length` must be the null terminator. */
562 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
563 }
564 return 1;
565
566 #undef CHECK
567 }
568
569 static PyObject*
570 78515081 unicode_result(PyObject *unicode)
571 {
572 assert(_PyUnicode_CHECK(unicode));
573
574 78515081 Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
575
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 78515081 times.
78515081 if (length == 0) {
576 PyObject *empty = unicode_get_empty();
577 if (unicode != empty) {
578 Py_DECREF(unicode);
579 Py_INCREF(empty);
580 }
581 return empty;
582 }
583
584
2/2
✓ Branch 0 taken 1450199 times.
✓ Branch 1 taken 77064882 times.
78515081 if (length == 1) {
585 1450199 int kind = PyUnicode_KIND(unicode);
586
2/2
✓ Branch 0 taken 238170 times.
✓ Branch 1 taken 1212029 times.
1450199 if (kind == PyUnicode_1BYTE_KIND) {
587 238170 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
588 238170 Py_UCS1 ch = data[0];
589
2/2
✓ Branch 0 taken 176151 times.
✓ Branch 1 taken 62019 times.
238170 PyObject *latin1_char = LATIN1(ch);
590
1/2
✓ Branch 0 taken 238170 times.
✗ Branch 1 not taken.
238170 if (unicode != latin1_char) {
591 238170 Py_INCREF(latin1_char);
592 238170 Py_DECREF(unicode);
593 }
594 238170 return latin1_char;
595 }
596 }
597
598 assert(_PyUnicode_CheckConsistency(unicode, 1));
599 78276911 return unicode;
600 }
601
602 static PyObject*
603 29460407 unicode_result_unchanged(PyObject *unicode)
604 {
605
2/2
✓ Branch 1 taken 28449799 times.
✓ Branch 2 taken 1010608 times.
29460407 if (PyUnicode_CheckExact(unicode)) {
606 28449799 Py_INCREF(unicode);
607 28449799 return unicode;
608 }
609 else
610 /* Subtype -- return genuine unicode string with the same value. */
611 1010608 return _PyUnicode_Copy(unicode);
612 }
613
614 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
615 ASCII, Latin1, UTF-8, etc. */
616 static char*
617 backslashreplace(_PyBytesWriter *writer, char *str,
618 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
619 {
620 Py_ssize_t size, i;
621 Py_UCS4 ch;
622 int kind;
623 const void *data;
624
625 kind = PyUnicode_KIND(unicode);
626 data = PyUnicode_DATA(unicode);
627
628 size = 0;
629 /* determine replacement size */
630 for (i = collstart; i < collend; ++i) {
631 Py_ssize_t incr;
632
633 ch = PyUnicode_READ(kind, data, i);
634 if (ch < 0x100)
635 incr = 2+2;
636 else if (ch < 0x10000)
637 incr = 2+4;
638 else {
639 assert(ch <= MAX_UNICODE);
640 incr = 2+8;
641 }
642 if (size > PY_SSIZE_T_MAX - incr) {
643 PyErr_SetString(PyExc_OverflowError,
644 "encoded result is too long for a Python string");
645 return NULL;
646 }
647 size += incr;
648 }
649
650 str = _PyBytesWriter_Prepare(writer, str, size);
651 if (str == NULL)
652 return NULL;
653
654 /* generate replacement */
655 for (i = collstart; i < collend; ++i) {
656 ch = PyUnicode_READ(kind, data, i);
657 *str++ = '\\';
658 if (ch >= 0x00010000) {
659 *str++ = 'U';
660 *str++ = Py_hexdigits[(ch>>28)&0xf];
661 *str++ = Py_hexdigits[(ch>>24)&0xf];
662 *str++ = Py_hexdigits[(ch>>20)&0xf];
663 *str++ = Py_hexdigits[(ch>>16)&0xf];
664 *str++ = Py_hexdigits[(ch>>12)&0xf];
665 *str++ = Py_hexdigits[(ch>>8)&0xf];
666 }
667 else if (ch >= 0x100) {
668 *str++ = 'u';
669 *str++ = Py_hexdigits[(ch>>12)&0xf];
670 *str++ = Py_hexdigits[(ch>>8)&0xf];
671 }
672 else
673 *str++ = 'x';
674 *str++ = Py_hexdigits[(ch>>4)&0xf];
675 *str++ = Py_hexdigits[ch&0xf];
676 }
677 return str;
678 }
679
680 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
681 ASCII, Latin1, UTF-8, etc. */
682 static char*
683 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
684 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
685 {
686 Py_ssize_t size, i;
687 Py_UCS4 ch;
688 int kind;
689 const void *data;
690
691 kind = PyUnicode_KIND(unicode);
692 data = PyUnicode_DATA(unicode);
693
694 size = 0;
695 /* determine replacement size */
696 for (i = collstart; i < collend; ++i) {
697 Py_ssize_t incr;
698
699 ch = PyUnicode_READ(kind, data, i);
700 if (ch < 10)
701 incr = 2+1+1;
702 else if (ch < 100)
703 incr = 2+2+1;
704 else if (ch < 1000)
705 incr = 2+3+1;
706 else if (ch < 10000)
707 incr = 2+4+1;
708 else if (ch < 100000)
709 incr = 2+5+1;
710 else if (ch < 1000000)
711 incr = 2+6+1;
712 else {
713 assert(ch <= MAX_UNICODE);
714 incr = 2+7+1;
715 }
716 if (size > PY_SSIZE_T_MAX - incr) {
717 PyErr_SetString(PyExc_OverflowError,
718 "encoded result is too long for a Python string");
719 return NULL;
720 }
721 size += incr;
722 }
723
724 str = _PyBytesWriter_Prepare(writer, str, size);
725 if (str == NULL)
726 return NULL;
727
728 /* generate replacement */
729 for (i = collstart; i < collend; ++i) {
730 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
731 if (size < 0) {
732 return NULL;
733 }
734 str += size;
735 }
736 return str;
737 }
738
739 /* --- Bloom Filters ----------------------------------------------------- */
740
741 /* stuff to implement simple "bloom filters" for Unicode characters.
742 to keep things simple, we use a single bitmask, using the least 5
743 bits from each unicode character as the bit index. */
744
745 /* the linebreak mask is set up by _PyUnicode_Init() below */
746
747 #if LONG_BIT >= 128
748 #define BLOOM_WIDTH 128
749 #elif LONG_BIT >= 64
750 #define BLOOM_WIDTH 64
751 #elif LONG_BIT >= 32
752 #define BLOOM_WIDTH 32
753 #else
754 #error "LONG_BIT is smaller than 32"
755 #endif
756
/* Integer type holding one bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask used by BLOOM_LINEBREAK(); starts with every bit set, so every
   character passes the filter until the mask is replaced. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero when the bit selected by the low bits of `ch` is set in `mask`,
   i.e. when `ch` *may* belong to the set the mask was built from.
   Both arguments are parenthesized so that expression arguments (for
   example `a | b` as the mask) group as intended. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Non-zero when `ch` is a line break: a table lookup for codes below 128,
   otherwise the bloom pre-filter followed by Py_UNICODE_ISLINEBREAK().
   `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
766
767 static inline BLOOM_MASK
768 9357443 make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
769 {
770 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
771 do { \
772 TYPE *data = (TYPE *)PTR; \
773 TYPE *end = data + LEN; \
774 Py_UCS4 ch; \
775 for (; data != end; data++) { \
776 ch = *data; \
777 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
778 } \
779 break; \
780 } while (0)
781
782 /* calculate simple bloom-style bitmask for a given unicode string */
783
784 BLOOM_MASK mask;
785
786 9357443 mask = 0;
787
2/4
✓ Branch 0 taken 9354035 times.
✓ Branch 1 taken 3408 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
9357443 switch (kind) {
788 9354035 case PyUnicode_1BYTE_KIND:
789
2/2
✓ Branch 0 taken 10036123 times.
✓ Branch 1 taken 9354035 times.
19390158 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
790 9354035 break;
791 3408 case PyUnicode_2BYTE_KIND:
792
2/2
✓ Branch 0 taken 27264 times.
✓ Branch 1 taken 3408 times.
30672 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
793 3408 break;
794 case PyUnicode_4BYTE_KIND:
795 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
796 break;
797 default:
798 Py_UNREACHABLE();
799 }
800 9357443 return mask;
801
802 #undef BLOOM_UPDATE
803 }
804
805 static int
806 223302753 ensure_unicode(PyObject *obj)
807 {
808
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 223302753 times.
223302753 if (!PyUnicode_Check(obj)) {
809 PyErr_Format(PyExc_TypeError,
810 "must be str, not %.100s",
811 Py_TYPE(obj)->tp_name);
812 return -1;
813 }
814 223302753 return 0;
815 }
816
817 /* Compilation of templated routines */
818
819 #define STRINGLIB_GET_EMPTY() unicode_get_empty()
820
821 #include "stringlib/asciilib.h"
822 #include "stringlib/fastsearch.h"
823 #include "stringlib/partition.h"
824 #include "stringlib/split.h"
825 #include "stringlib/count.h"
826 #include "stringlib/find.h"
827 #include "stringlib/find_max_char.h"
828 #include "stringlib/undef.h"
829
830 #include "stringlib/ucs1lib.h"
831 #include "stringlib/fastsearch.h"
832 #include "stringlib/partition.h"
833 #include "stringlib/split.h"
834 #include "stringlib/count.h"
835 #include "stringlib/find.h"
836 #include "stringlib/replace.h"
837 #include "stringlib/find_max_char.h"
838 #include "stringlib/undef.h"
839
840 #include "stringlib/ucs2lib.h"
841 #include "stringlib/fastsearch.h"
842 #include "stringlib/partition.h"
843 #include "stringlib/split.h"
844 #include "stringlib/count.h"
845 #include "stringlib/find.h"
846 #include "stringlib/replace.h"
847 #include "stringlib/find_max_char.h"
848 #include "stringlib/undef.h"
849
850 #include "stringlib/ucs4lib.h"
851 #include "stringlib/fastsearch.h"
852 #include "stringlib/partition.h"
853 #include "stringlib/split.h"
854 #include "stringlib/count.h"
855 #include "stringlib/find.h"
856 #include "stringlib/replace.h"
857 #include "stringlib/find_max_char.h"
858 #include "stringlib/undef.h"
859
860 #undef STRINGLIB_GET_EMPTY
861
862 /* --- Unicode Object ----------------------------------------------------- */
863
864 static inline Py_ssize_t
865 79983429 findchar(const void *s, int kind,
866 Py_ssize_t size, Py_UCS4 ch,
867 int direction)
868 {
869
3/4
✓ Branch 0 taken 79981976 times.
✓ Branch 1 taken 1099 times.
✓ Branch 2 taken 354 times.
✗ Branch 3 not taken.
79983429 switch (kind) {
870 79981976 case PyUnicode_1BYTE_KIND:
871
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 79981976 times.
79981976 if ((Py_UCS1) ch != ch)
872 return -1;
873
2/2
✓ Branch 0 taken 78177159 times.
✓ Branch 1 taken 1804817 times.
79981976 if (direction > 0)
874 78177159 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
875 else
876 1804817 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
877 1099 case PyUnicode_2BYTE_KIND:
878
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1099 times.
1099 if ((Py_UCS2) ch != ch)
879 return -1;
880
1/2
✓ Branch 0 taken 1099 times.
✗ Branch 1 not taken.
1099 if (direction > 0)
881 1099 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
882 else
883 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
884 354 case PyUnicode_4BYTE_KIND:
885
1/2
✓ Branch 0 taken 354 times.
✗ Branch 1 not taken.
354 if (direction > 0)
886 354 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
887 else
888 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
889 default:
890 Py_UNREACHABLE();
891 }
892 }
893
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters added by a resize with 0xff bytes
   so that use of uninitialised content is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Characters at indices [old_length, current length) are filled; nothing
   is done when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);
    if (new_length > old_length) {
        /* The kind value doubles as the character size in bytes. */
        int char_size = PyUnicode_KIND(unicode);
        Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
912
913 static PyObject*
914 76025076 resize_compact(PyObject *unicode, Py_ssize_t length)
915 {
916 Py_ssize_t char_size;
917 Py_ssize_t struct_size;
918 Py_ssize_t new_size;
919 PyObject *new_unicode;
920 #ifdef Py_DEBUG
921 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
922 #endif
923
924 assert(unicode_modifiable(unicode));
925 assert(PyUnicode_IS_COMPACT(unicode));
926
927 76025076 char_size = PyUnicode_KIND(unicode);
928
2/2
✓ Branch 1 taken 74220974 times.
✓ Branch 2 taken 1804102 times.
76025076 if (PyUnicode_IS_ASCII(unicode))
929 74220974 struct_size = sizeof(PyASCIIObject);
930 else
931 1804102 struct_size = sizeof(PyCompactUnicodeObject);
932
933
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 76025076 times.
76025076 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
934 PyErr_NoMemory();
935 return NULL;
936 }
937 76025076 new_size = (struct_size + (length + 1) * char_size);
938
939
3/6
✓ Branch 1 taken 1804102 times.
✓ Branch 2 taken 74220974 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 1804102 times.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
76025076 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
940 PyObject_Free(_PyUnicode_UTF8(unicode));
941 _PyUnicode_UTF8(unicode) = NULL;
942 _PyUnicode_UTF8_LENGTH(unicode) = 0;
943 }
944 #ifdef Py_REF_DEBUG
945 _Py_RefTotal--;
946 #endif
947 #ifdef Py_TRACE_REFS
948 _Py_ForgetReference(unicode);
949 #endif
950
951 76025076 new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
952
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 76025076 times.
76025076 if (new_unicode == NULL) {
953 _Py_NewReference(unicode);
954 PyErr_NoMemory();
955 return NULL;
956 }
957 76025076 unicode = new_unicode;
958 76025076 _Py_NewReference(unicode);
959
960 76025076 _PyUnicode_LENGTH(unicode) = length;
961 #ifdef Py_DEBUG
962 unicode_fill_invalid(unicode, old_length);
963 #endif
964 76025076 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
965 length, 0);
966 assert(_PyUnicode_CheckConsistency(unicode, 0));
967 76025076 return unicode;
968 }
969
970 static int
971 resize_inplace(PyObject *unicode, Py_ssize_t length)
972 {
973 assert(!PyUnicode_IS_COMPACT(unicode));
974 assert(Py_REFCNT(unicode) == 1);
975
976 Py_ssize_t new_size;
977 Py_ssize_t char_size;
978 int share_utf8;
979 void *data;
980 #ifdef Py_DEBUG
981 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
982 #endif
983
984 data = _PyUnicode_DATA_ANY(unicode);
985 char_size = PyUnicode_KIND(unicode);
986 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
987
988 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
989 PyErr_NoMemory();
990 return -1;
991 }
992 new_size = (length + 1) * char_size;
993
994 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
995 {
996 PyObject_Free(_PyUnicode_UTF8(unicode));
997 _PyUnicode_UTF8(unicode) = NULL;
998 _PyUnicode_UTF8_LENGTH(unicode) = 0;
999 }
1000
1001 data = (PyObject *)PyObject_Realloc(data, new_size);
1002 if (data == NULL) {
1003 PyErr_NoMemory();
1004 return -1;
1005 }
1006 _PyUnicode_DATA_ANY(unicode) = data;
1007 if (share_utf8) {
1008 _PyUnicode_UTF8(unicode) = data;
1009 _PyUnicode_UTF8_LENGTH(unicode) = length;
1010 }
1011 _PyUnicode_LENGTH(unicode) = length;
1012 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1013 #ifdef Py_DEBUG
1014 unicode_fill_invalid(unicode, old_length);
1015 #endif
1016
1017 /* check for integer overflow */
1018 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1019 PyErr_NoMemory();
1020 return -1;
1021 }
1022 assert(_PyUnicode_CheckConsistency(unicode, 0));
1023 return 0;
1024 }
1025
1026 static PyObject*
1027 resize_copy(PyObject *unicode, Py_ssize_t length)
1028 {
1029 Py_ssize_t copy_length;
1030 PyObject *copy;
1031
1032 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1033 if (copy == NULL)
1034 return NULL;
1035
1036 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1037 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1038 return copy;
1039 }
1040
1041 static const char*
1042 unicode_kind_name(PyObject *unicode)
1043 {
1044 /* don't check consistency: unicode_kind_name() is called from
1045 _PyUnicode_Dump() */
1046 if (!PyUnicode_IS_COMPACT(unicode))
1047 {
1048 switch (PyUnicode_KIND(unicode))
1049 {
1050 case PyUnicode_1BYTE_KIND:
1051 if (PyUnicode_IS_ASCII(unicode))
1052 return "legacy ascii";
1053 else
1054 return "legacy latin1";
1055 case PyUnicode_2BYTE_KIND:
1056 return "legacy UCS2";
1057 case PyUnicode_4BYTE_KIND:
1058 return "legacy UCS4";
1059 default:
1060 return "<legacy invalid kind>";
1061 }
1062 }
1063 switch (PyUnicode_KIND(unicode)) {
1064 case PyUnicode_1BYTE_KIND:
1065 if (PyUnicode_IS_ASCII(unicode))
1066 return "ascii";
1067 else
1068 return "latin1";
1069 case PyUnicode_2BYTE_KIND:
1070 return "UCS2";
1071 case PyUnicode_4BYTE_KIND:
1072 return "UCS4";
1073 default:
1074 return "<invalid compact kind>";
1075 }
1076 }
1077
1078 #ifdef Py_DEBUG
1079 /* Functions wrapping macros for use in debugger */
1080 const char *_PyUnicode_utf8(void *unicode_raw){
1081 PyObject *unicode = _PyObject_CAST(unicode_raw);
1082 return PyUnicode_UTF8(unicode);
1083 }
1084
1085 const void *_PyUnicode_compact_data(void *unicode_raw) {
1086 PyObject *unicode = _PyObject_CAST(unicode_raw);
1087 return _PyUnicode_COMPACT_DATA(unicode);
1088 }
1089 const void *_PyUnicode_data(void *unicode_raw) {
1090 PyObject *unicode = _PyObject_CAST(unicode_raw);
1091 printf("obj %p\n", (void*)unicode);
1092 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1093 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1094 printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1095 printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1096 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1097 return PyUnicode_DATA(unicode);
1098 }
1099
1100 void
1101 _PyUnicode_Dump(PyObject *op)
1102 {
1103 PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1104 PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1105 PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1106 const void *data;
1107
1108 if (ascii->state.compact)
1109 {
1110 if (ascii->state.ascii)
1111 data = (ascii + 1);
1112 else
1113 data = (compact + 1);
1114 }
1115 else
1116 data = unicode->data.any;
1117 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1118
1119 if (!ascii->state.ascii) {
1120 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1121 }
1122 printf(", data=%p\n", data);
1123 }
1124 #endif
1125
1126
1127 PyObject *
1128 419966173 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1129 {
1130 /* Optimization for empty strings */
1131
2/2
✓ Branch 0 taken 3989574 times.
✓ Branch 1 taken 415976599 times.
419966173 if (size == 0) {
1132 3989574 return unicode_new_empty();
1133 }
1134
1135 PyObject *obj;
1136 PyCompactUnicodeObject *unicode;
1137 void *data;
1138 int kind;
1139 int is_ascii;
1140 Py_ssize_t char_size;
1141 Py_ssize_t struct_size;
1142
1143 415976599 is_ascii = 0;
1144 415976599 struct_size = sizeof(PyCompactUnicodeObject);
1145
2/2
✓ Branch 0 taken 412842963 times.
✓ Branch 1 taken 3133636 times.
415976599 if (maxchar < 128) {
1146 412842963 kind = PyUnicode_1BYTE_KIND;
1147 412842963 char_size = 1;
1148 412842963 is_ascii = 1;
1149 412842963 struct_size = sizeof(PyASCIIObject);
1150 }
1151
2/2
✓ Branch 0 taken 128809 times.
✓ Branch 1 taken 3004827 times.
3133636 else if (maxchar < 256) {
1152 128809 kind = PyUnicode_1BYTE_KIND;
1153 128809 char_size = 1;
1154 }
1155
2/2
✓ Branch 0 taken 2220763 times.
✓ Branch 1 taken 784064 times.
3004827 else if (maxchar < 65536) {
1156 2220763 kind = PyUnicode_2BYTE_KIND;
1157 2220763 char_size = 2;
1158 }
1159 else {
1160
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 784064 times.
784064 if (maxchar > MAX_UNICODE) {
1161 PyErr_SetString(PyExc_SystemError,
1162 "invalid maximum character passed to PyUnicode_New");
1163 return NULL;
1164 }
1165 784064 kind = PyUnicode_4BYTE_KIND;
1166 784064 char_size = 4;
1167 }
1168
1169 /* Ensure we won't overflow the size. */
1170
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 415976599 times.
415976599 if (size < 0) {
1171 PyErr_SetString(PyExc_SystemError,
1172 "Negative size passed to PyUnicode_New");
1173 return NULL;
1174 }
1175
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 415976599 times.
415976599 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1176 return PyErr_NoMemory();
1177
1178 /* Duplicated allocation code from _PyObject_New() instead of a call to
1179 * PyObject_New() so we are able to allocate space for the object and
1180 * it's data buffer.
1181 */
1182 415976599 obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1183
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 415976599 times.
415976599 if (obj == NULL) {
1184 return PyErr_NoMemory();
1185 }
1186 415976599 _PyObject_Init(obj, &PyUnicode_Type);
1187
1188 415976599 unicode = (PyCompactUnicodeObject *)obj;
1189
2/2
✓ Branch 0 taken 412842963 times.
✓ Branch 1 taken 3133636 times.
415976599 if (is_ascii)
1190 412842963 data = ((PyASCIIObject*)obj) + 1;
1191 else
1192 3133636 data = unicode + 1;
1193 415976599 _PyUnicode_LENGTH(unicode) = size;
1194 415976599 _PyUnicode_HASH(unicode) = -1;
1195 415976599 _PyUnicode_STATE(unicode).interned = 0;
1196 415976599 _PyUnicode_STATE(unicode).kind = kind;
1197 415976599 _PyUnicode_STATE(unicode).compact = 1;
1198 415976599 _PyUnicode_STATE(unicode).ascii = is_ascii;
1199
2/2
✓ Branch 0 taken 412842963 times.
✓ Branch 1 taken 3133636 times.
415976599 if (is_ascii) {
1200 412842963 ((char*)data)[size] = 0;
1201 }
1202
2/2
✓ Branch 0 taken 128809 times.
✓ Branch 1 taken 3004827 times.
3133636 else if (kind == PyUnicode_1BYTE_KIND) {
1203 128809 ((char*)data)[size] = 0;
1204 128809 unicode->utf8 = NULL;
1205 128809 unicode->utf8_length = 0;
1206 }
1207 else {
1208 3004827 unicode->utf8 = NULL;
1209 3004827 unicode->utf8_length = 0;
1210
2/2
✓ Branch 0 taken 2220763 times.
✓ Branch 1 taken 784064 times.
3004827 if (kind == PyUnicode_2BYTE_KIND)
1211 2220763 ((Py_UCS2*)data)[size] = 0;
1212 else /* kind == PyUnicode_4BYTE_KIND */
1213 784064 ((Py_UCS4*)data)[size] = 0;
1214 }
1215 #ifdef Py_DEBUG
1216 unicode_fill_invalid((PyObject*)unicode, 0);
1217 #endif
1218 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1219 415976599 return obj;
1220 }
1221
1222 #if SIZEOF_WCHAR_T == 2
1223 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1224 will decode surrogate pairs, the other conversions are implemented as macros
1225 for efficiency.
1226
1227 This function assumes that unicode can hold one more code point than wstr
1228 characters for a terminating null character. */
1229 static void
1230 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1231 PyObject *unicode)
1232 {
1233 const wchar_t *iter;
1234 Py_UCS4 *ucs4_out;
1235
1236 assert(unicode != NULL);
1237 assert(_PyUnicode_CHECK(unicode));
1238 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1239 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1240
1241 for (iter = begin; iter < end; ) {
1242 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1243 _PyUnicode_GET_LENGTH(unicode)));
1244 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1245 && (iter+1) < end
1246 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1247 {
1248 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1249 iter += 2;
1250 }
1251 else {
1252 *ucs4_out++ = *iter;
1253 iter++;
1254 }
1255 }
1256 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1257 _PyUnicode_GET_LENGTH(unicode)));
1258
1259 }
1260 #endif
1261
1262 static int
1263 8830 unicode_check_modifiable(PyObject *unicode)
1264 {
1265
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 8830 times.
8830 if (!unicode_modifiable(unicode)) {
1266 PyErr_SetString(PyExc_SystemError,
1267 "Cannot modify a string currently used");
1268 return -1;
1269 }
1270 8830 return 0;
1271 }
1272
1273 static int
1274 210108881 _copy_characters(PyObject *to, Py_ssize_t to_start,
1275 PyObject *from, Py_ssize_t from_start,
1276 Py_ssize_t how_many, int check_maxchar)
1277 {
1278 int from_kind, to_kind;
1279 const void *from_data;
1280 void *to_data;
1281
1282 assert(0 <= how_many);
1283 assert(0 <= from_start);
1284 assert(0 <= to_start);
1285 assert(PyUnicode_Check(from));
1286 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1287
1288 assert(PyUnicode_Check(to));
1289 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1290
1291
2/2
✓ Branch 0 taken 1744090 times.
✓ Branch 1 taken 208364791 times.
210108881 if (how_many == 0)
1292 1744090 return 0;
1293
1294 208364791 from_kind = PyUnicode_KIND(from);
1295 208364791 from_data = PyUnicode_DATA(from);
1296 208364791 to_kind = PyUnicode_KIND(to);
1297 208364791 to_data = PyUnicode_DATA(to);
1298
1299 #ifdef Py_DEBUG
1300 if (!check_maxchar
1301 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1302 {
1303 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1304 Py_UCS4 ch;
1305 Py_ssize_t i;
1306 for (i=0; i < how_many; i++) {
1307 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1308 assert(ch <= to_maxchar);
1309 }
1310 }
1311 #endif
1312
1313
2/2
✓ Branch 0 taken 207642767 times.
✓ Branch 1 taken 722024 times.
208364791 if (from_kind == to_kind) {
1314
2/2
✓ Branch 0 taken 3676 times.
✓ Branch 1 taken 207639091 times.
207642767 if (check_maxchar
1315
1/4
✗ Branch 1 not taken.
✓ Branch 2 taken 3676 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
3676 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1316 {
1317 /* Writing Latin-1 characters into an ASCII string requires to
1318 check that all written characters are pure ASCII */
1319 Py_UCS4 max_char;
1320 max_char = ucs1lib_find_max_char(from_data,
1321 (const Py_UCS1*)from_data + how_many);
1322 if (max_char >= 128)
1323 return -1;
1324 }
1325 207642767 memcpy((char*)to_data + to_kind * to_start,
1326 207642767 (const char*)from_data + from_kind * from_start,
1327 207642767 to_kind * how_many);
1328 }
1329
2/2
✓ Branch 0 taken 712054 times.
✓ Branch 1 taken 9970 times.
722024 else if (from_kind == PyUnicode_1BYTE_KIND
1330
2/2
✓ Branch 0 taken 240263 times.
✓ Branch 1 taken 471791 times.
712054 && to_kind == PyUnicode_2BYTE_KIND)
1331 {
1332
4/4
✓ Branch 3 taken 2168694 times.
✓ Branch 4 taken 240263 times.
✓ Branch 5 taken 303436 times.
✓ Branch 6 taken 240263 times.
2712393 _PyUnicode_CONVERT_BYTES(
1333 Py_UCS1, Py_UCS2,
1334 PyUnicode_1BYTE_DATA(from) + from_start,
1335 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1336 PyUnicode_2BYTE_DATA(to) + to_start
1337 );
1338 }
1339
2/2
✓ Branch 0 taken 471791 times.
✓ Branch 1 taken 9970 times.
481761 else if (from_kind == PyUnicode_1BYTE_KIND
1340
1/2
✓ Branch 0 taken 471791 times.
✗ Branch 1 not taken.
471791 && to_kind == PyUnicode_4BYTE_KIND)
1341 {
1342
4/4
✓ Branch 3 taken 3905631 times.
✓ Branch 4 taken 471791 times.
✓ Branch 5 taken 615208 times.
✓ Branch 6 taken 471791 times.
4992630 _PyUnicode_CONVERT_BYTES(
1343 Py_UCS1, Py_UCS4,
1344 PyUnicode_1BYTE_DATA(from) + from_start,
1345 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1346 PyUnicode_4BYTE_DATA(to) + to_start
1347 );
1348 }
1349
1/2
✓ Branch 0 taken 9970 times.
✗ Branch 1 not taken.
9970 else if (from_kind == PyUnicode_2BYTE_KIND
1350
1/2
✓ Branch 0 taken 9970 times.
✗ Branch 1 not taken.
9970 && to_kind == PyUnicode_4BYTE_KIND)
1351 {
1352
4/4
✓ Branch 3 taken 2914 times.
✓ Branch 4 taken 9970 times.
✓ Branch 5 taken 10357 times.
✓ Branch 6 taken 9970 times.
23241 _PyUnicode_CONVERT_BYTES(
1353 Py_UCS2, Py_UCS4,
1354 PyUnicode_2BYTE_DATA(from) + from_start,
1355 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1356 PyUnicode_4BYTE_DATA(to) + to_start
1357 );
1358 }
1359 else {
1360 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1361
1362 if (!check_maxchar) {
1363 if (from_kind == PyUnicode_2BYTE_KIND
1364 && to_kind == PyUnicode_1BYTE_KIND)
1365 {
1366 _PyUnicode_CONVERT_BYTES(
1367 Py_UCS2, Py_UCS1,
1368 PyUnicode_2BYTE_DATA(from) + from_start,
1369 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1370 PyUnicode_1BYTE_DATA(to) + to_start
1371 );
1372 }
1373 else if (from_kind == PyUnicode_4BYTE_KIND
1374 && to_kind == PyUnicode_1BYTE_KIND)
1375 {
1376 _PyUnicode_CONVERT_BYTES(
1377 Py_UCS4, Py_UCS1,
1378 PyUnicode_4BYTE_DATA(from) + from_start,
1379 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1380 PyUnicode_1BYTE_DATA(to) + to_start
1381 );
1382 }
1383 else if (from_kind == PyUnicode_4BYTE_KIND
1384 && to_kind == PyUnicode_2BYTE_KIND)
1385 {
1386 _PyUnicode_CONVERT_BYTES(
1387 Py_UCS4, Py_UCS2,
1388 PyUnicode_4BYTE_DATA(from) + from_start,
1389 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1390 PyUnicode_2BYTE_DATA(to) + to_start
1391 );
1392 }
1393 else {
1394 Py_UNREACHABLE();
1395 }
1396 }
1397 else {
1398 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1399 Py_UCS4 ch;
1400 Py_ssize_t i;
1401
1402 for (i=0; i < how_many; i++) {
1403 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1404 if (ch > to_maxchar)
1405 return -1;
1406 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1407 }
1408 }
1409 }
1410 208364791 return 0;
1411 }
1412
1413 void
1414 210105205 _PyUnicode_FastCopyCharacters(
1415 PyObject *to, Py_ssize_t to_start,
1416 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1417 {
1418 210105205 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1419 210105205 }
1420
1421 Py_ssize_t
1422 3676 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1423 PyObject *from, Py_ssize_t from_start,
1424 Py_ssize_t how_many)
1425 {
1426 int err;
1427
1428
2/4
✓ Branch 2 taken 3676 times.
✗ Branch 3 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 3676 times.
3676 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1429 PyErr_BadInternalCall();
1430 return -1;
1431 }
1432
1433
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3676 times.
3676 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1434 PyErr_SetString(PyExc_IndexError, "string index out of range");
1435 return -1;
1436 }
1437
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3676 times.
3676 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1438 PyErr_SetString(PyExc_IndexError, "string index out of range");
1439 return -1;
1440 }
1441
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3676 times.
3676 if (how_many < 0) {
1442 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1443 return -1;
1444 }
1445
1/2
✓ Branch 1 taken 3676 times.
✗ Branch 2 not taken.
3676 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1446
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3676 times.
3676 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1447 PyErr_Format(PyExc_SystemError,
1448 "Cannot write %zi characters at %zi "
1449 "in a string of %zi characters",
1450 how_many, to_start, PyUnicode_GET_LENGTH(to));
1451 return -1;
1452 }
1453
1454
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3676 times.
3676 if (how_many == 0)
1455 return 0;
1456
1457
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3676 times.
3676 if (unicode_check_modifiable(to))
1458 return -1;
1459
1460 3676 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1461
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3676 times.
3676 if (err) {
1462 PyErr_Format(PyExc_SystemError,
1463 "Cannot copy %s characters "
1464 "into a string of %s characters",
1465 unicode_kind_name(from),
1466 unicode_kind_name(to));
1467 return -1;
1468 }
1469 3676 return how_many;
1470 }
1471
1472 /* Find the maximum code point and count the number of surrogate pairs so a
1473 correct string length can be computed before converting a string to UCS4.
1474 This function counts single surrogates as a character and not as a pair.
1475
1476 Return 0 on success, or -1 on error. */
1477 static int
1478 2015053 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1479 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1480 {
1481 const wchar_t *iter;
1482 Py_UCS4 ch;
1483
1484 assert(num_surrogates != NULL && maxchar != NULL);
1485 2015053 *num_surrogates = 0;
1486 2015053 *maxchar = 0;
1487
1488
2/2
✓ Branch 0 taken 48213047 times.
✓ Branch 1 taken 2015053 times.
50228100 for (iter = begin; iter < end; ) {
1489 #if SIZEOF_WCHAR_T == 2
1490 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1491 && (iter+1) < end
1492 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1493 {
1494 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1495 ++(*num_surrogates);
1496 iter += 2;
1497 }
1498 else
1499 #endif
1500 {
1501 48213047 ch = *iter;
1502 48213047 iter++;
1503 }
1504
2/2
✓ Branch 0 taken 7473126 times.
✓ Branch 1 taken 40739921 times.
48213047 if (ch > *maxchar) {
1505 7473126 *maxchar = ch;
1506
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 7473126 times.
7473126 if (*maxchar > MAX_UNICODE) {
1507 PyErr_Format(PyExc_ValueError,
1508 "character U+%x is not in range [U+0000; U+%x]",
1509 ch, MAX_UNICODE);
1510 return -1;
1511 }
1512 }
1513 }
1514 2015053 return 0;
1515 }
1516
1517 static void
1518 417114522 unicode_dealloc(PyObject *unicode)
1519 {
1520 #ifdef Py_DEBUG
1521 if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1522 _Py_FatalRefcountError("deallocating an Unicode singleton");
1523 }
1524 #endif
1525
1526
2/2
✓ Branch 1 taken 29085823 times.
✓ Branch 2 taken 388028699 times.
417114522 if (PyUnicode_CHECK_INTERNED(unicode)) {
1527 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1528 references (key and value) which were ignored by
1529 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1530 to prevent calling unicode_dealloc() again. Adjust refcnt after
1531 PyDict_DelItem(). */
1532 assert(Py_REFCNT(unicode) == 0);
1533 29085823 Py_SET_REFCNT(unicode, 3);
1534
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 29085823 times.
29085823 if (PyDict_DelItem(interned, unicode) != 0) {
1535 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1536 NULL);
1537 }
1538 assert(Py_REFCNT(unicode) == 1);
1539 29085823 Py_SET_REFCNT(unicode, 0);
1540 }
1541
1542
6/6
✓ Branch 1 taken 6026732 times.
✓ Branch 2 taken 411087790 times.
✓ Branch 3 taken 2894182 times.
✓ Branch 4 taken 3132550 times.
✓ Branch 6 taken 1086 times.
✓ Branch 7 taken 2893096 times.
417114522 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1543 1086 PyObject_Free(_PyUnicode_UTF8(unicode));
1544 }
1545
3/4
✓ Branch 1 taken 2893096 times.
✓ Branch 2 taken 414221426 times.
✓ Branch 3 taken 2893096 times.
✗ Branch 4 not taken.
417114522 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1546 2893096 PyObject_Free(_PyUnicode_DATA_ANY(unicode));
1547 }
1548
1549 417114522 Py_TYPE(unicode)->tp_free(unicode);
1550 417114522 }
1551
1552 #ifdef Py_DEBUG
1553 static int
1554 unicode_is_singleton(PyObject *unicode)
1555 {
1556 if (unicode == &_Py_STR(empty)) {
1557 return 1;
1558 }
1559
1560 PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1561 if (ascii->length == 1) {
1562 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1563 if (ch < 256 && LATIN1(ch) == unicode) {
1564 return 1;
1565 }
1566 }
1567 return 0;
1568 }
1569 #endif
1570
1571 static int
1572 2955865 unicode_modifiable(PyObject *unicode)
1573 {
1574 assert(_PyUnicode_CHECK(unicode));
1575
2/2
✓ Branch 1 taken 1861751 times.
✓ Branch 2 taken 1094114 times.
2955865 if (Py_REFCNT(unicode) != 1)
1576 1861751 return 0;
1577
2/2
✓ Branch 0 taken 1950 times.
✓ Branch 1 taken 1092164 times.
1094114 if (_PyUnicode_HASH(unicode) != -1)
1578 1950 return 0;
1579
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 1092164 times.
1092164 if (PyUnicode_CHECK_INTERNED(unicode))
1580 return 0;
1581
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 1092164 times.
1092164 if (!PyUnicode_CheckExact(unicode))
1582 return 0;
1583 #ifdef Py_DEBUG
1584 /* singleton refcount is greater than 1 */
1585 assert(!unicode_is_singleton(unicode));
1586 #endif
1587 1092164 return 1;
1588 }
1589
1590 static int
1591 541655 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1592 {
1593 PyObject *unicode;
1594 Py_ssize_t old_length;
1595
1596 assert(p_unicode != NULL);
1597 541655 unicode = *p_unicode;
1598
1599 assert(unicode != NULL);
1600 assert(PyUnicode_Check(unicode));
1601 assert(0 <= length);
1602
1603 541655 old_length = PyUnicode_GET_LENGTH(unicode);
1604
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 541655 times.
541655 if (old_length == length)
1605 return 0;
1606
1607
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 541655 times.
541655 if (length == 0) {
1608 PyObject *empty = unicode_new_empty();
1609 Py_SETREF(*p_unicode, empty);
1610 return 0;
1611 }
1612
1613
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 541655 times.
541655 if (!unicode_modifiable(unicode)) {
1614 PyObject *copy = resize_copy(unicode, length);
1615 if (copy == NULL)
1616 return -1;
1617 Py_SETREF(*p_unicode, copy);
1618 return 0;
1619 }
1620
1621
1/2
✓ Branch 1 taken 541655 times.
✗ Branch 2 not taken.
541655 if (PyUnicode_IS_COMPACT(unicode)) {
1622 541655 PyObject *new_unicode = resize_compact(unicode, length);
1623
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 541655 times.
541655 if (new_unicode == NULL)
1624 return -1;
1625 541655 *p_unicode = new_unicode;
1626 541655 return 0;
1627 }
1628 return resize_inplace(unicode, length);
1629 }
1630
1631 int
1632 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1633 {
1634 PyObject *unicode;
1635 if (p_unicode == NULL) {
1636 PyErr_BadInternalCall();
1637 return -1;
1638 }
1639 unicode = *p_unicode;
1640 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1641 {
1642 PyErr_BadInternalCall();
1643 return -1;
1644 }
1645 return unicode_resize(p_unicode, length);
1646 }
1647
1648 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
1649
1650 WARNING: The function doesn't copy the terminating null character and
1651 doesn't check the maximum character (may write a latin1 character in an
1652 ASCII string). */
1653 static void
1654 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1655 const char *str, Py_ssize_t len)
1656 {
1657 int kind = PyUnicode_KIND(unicode);
1658 const void *data = PyUnicode_DATA(unicode);
1659 const char *end = str + len;
1660
1661 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1662 switch (kind) {
1663 case PyUnicode_1BYTE_KIND: {
1664 #ifdef Py_DEBUG
1665 if (PyUnicode_IS_ASCII(unicode)) {
1666 Py_UCS4 maxchar = ucs1lib_find_max_char(
1667 (const Py_UCS1*)str,
1668 (const Py_UCS1*)str + len);
1669 assert(maxchar < 128);
1670 }
1671 #endif
1672 memcpy((char *) data + index, str, len);
1673 break;
1674 }
1675 case PyUnicode_2BYTE_KIND: {
1676 Py_UCS2 *start = (Py_UCS2 *)data + index;
1677 Py_UCS2 *ucs2 = start;
1678
1679 for (; str < end; ++ucs2, ++str)
1680 *ucs2 = (Py_UCS2)*str;
1681
1682 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1683 break;
1684 }
1685 case PyUnicode_4BYTE_KIND: {
1686 Py_UCS4 *start = (Py_UCS4 *)data + index;
1687 Py_UCS4 *ucs4 = start;
1688
1689 for (; str < end; ++ucs4, ++str)
1690 *ucs4 = (Py_UCS4)*str;
1691
1692 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1693 break;
1694 }
1695 default:
1696 Py_UNREACHABLE();
1697 }
1698 }
1699
1700 static PyObject*
1701 70434497 get_latin1_char(Py_UCS1 ch)
1702 {
1703
2/2
✓ Branch 0 taken 70035679 times.
✓ Branch 1 taken 398818 times.
70434497 return Py_NewRef(LATIN1(ch));
1704 }
1705
1706 static PyObject*
1707 57517379 unicode_char(Py_UCS4 ch)
1708 {
1709 PyObject *unicode;
1710
1711 assert(ch <= MAX_UNICODE);
1712
1713
2/2
✓ Branch 0 taken 56273566 times.
✓ Branch 1 taken 1243813 times.
57517379 if (ch < 256) {
1714 56273566 return get_latin1_char(ch);
1715 }
1716
1717 1243813 unicode = PyUnicode_New(1, ch);
1718
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1243813 times.
1243813 if (unicode == NULL)
1719 return NULL;
1720
1721 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1722
2/2
✓ Branch 0 taken 1185785 times.
✓ Branch 1 taken 58028 times.
1243813 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1723 1185785 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1724 } else {
1725 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1726 58028 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1727 }
1728 assert(_PyUnicode_CheckConsistency(unicode, 1));
1729 1243813 return unicode;
1730 }
1731
1732 PyObject *
1733 2042078 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1734 {
1735 PyObject *unicode;
1736 2042078 Py_UCS4 maxchar = 0;
1737 Py_ssize_t num_surrogates;
1738
1739
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2042078 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
2042078 if (u == NULL && size != 0) {
1740 PyErr_BadInternalCall();
1741 return NULL;
1742 }
1743
1744
2/2
✓ Branch 0 taken 381188 times.
✓ Branch 1 taken 1660890 times.
2042078 if (size == -1) {
1745 381188 size = wcslen(u);
1746 }
1747
1748 /* If the Unicode data is known at construction time, we can apply
1749 some optimizations which share commonly used objects. */
1750
1751 /* Optimization for empty strings */
1752
2/2
✓ Branch 0 taken 961 times.
✓ Branch 1 taken 2041117 times.
2042078 if (size == 0)
1753 961 _Py_RETURN_UNICODE_EMPTY();
1754
1755 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1756 /* Oracle Solaris uses non-Unicode internal wchar_t form for
1757 non-Unicode locales and hence needs conversion to UCS-4 first. */
1758 if (_Py_LocaleUsesNonUnicodeWchar()) {
1759 wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1760 if (!converted) {
1761 return NULL;
1762 }
1763 PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1764 PyMem_Free(converted);
1765 return unicode;
1766 }
1767 #endif
1768
1769 /* Single character Unicode objects in the Latin-1 range are
1770 shared when using this constructor */
1771
3/4
✓ Branch 0 taken 26064 times.
✓ Branch 1 taken 2015053 times.
✓ Branch 2 taken 26064 times.
✗ Branch 3 not taken.
2041117 if (size == 1 && (Py_UCS4)*u < 256)
1772 26064 return get_latin1_char((unsigned char)*u);
1773
1774 /* If not empty and not single character, copy the Unicode data
1775 into the new object */
1776
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 2015053 times.
2015053 if (find_maxchar_surrogates(u, u + size,
1777 &maxchar, &num_surrogates) == -1)
1778 return NULL;
1779
1780 2015053 unicode = PyUnicode_New(size - num_surrogates, maxchar);
1781
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2015053 times.
2015053 if (!unicode)
1782 return NULL;
1783
1784
1/4
✓ Branch 0 taken 2015053 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
2015053 switch (PyUnicode_KIND(unicode)) {
1785 2015053 case PyUnicode_1BYTE_KIND:
1786
4/4
✓ Branch 1 taken 11405899 times.
✓ Branch 2 taken 2015053 times.
✓ Branch 3 taken 2589451 times.
✓ Branch 4 taken 2015053 times.
16010403 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1787 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1788 2015053 break;
1789 case PyUnicode_2BYTE_KIND:
1790 #if Py_UNICODE_SIZE == 2
1791 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1792 #else
1793 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1794 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1795 #endif
1796 break;
1797 case PyUnicode_4BYTE_KIND:
1798 #if SIZEOF_WCHAR_T == 2
1799 /* This is the only case which has to process surrogates, thus
1800 a simple copy loop is not enough and we need a function. */
1801 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1802 #else
1803 assert(num_surrogates == 0);
1804 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1805 #endif
1806 break;
1807 default:
1808 Py_UNREACHABLE();
1809 }
1810
1811 2015053 return unicode_result(unicode);
1812 }
1813
1814 PyObject *
1815 3341180 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1816 {
1817
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3341180 times.
3341180 if (size < 0) {
1818 PyErr_SetString(PyExc_SystemError,
1819 "Negative size passed to PyUnicode_FromStringAndSize");
1820 return NULL;
1821 }
1822
2/2
✓ Branch 0 taken 3307409 times.
✓ Branch 1 taken 33771 times.
3341180 if (u != NULL) {
1823 3307409 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1824 }
1825
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 33771 times.
33771 if (size > 0) {
1826 PyErr_SetString(PyExc_SystemError,
1827 "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
1828 return NULL;
1829 }
1830 33771 return unicode_new_empty();
1831 }
1832
1833 PyObject *
1834 45264200 PyUnicode_FromString(const char *u)
1835 {
1836 45264200 size_t size = strlen(u);
1837
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 45264200 times.
45264200 if (size > PY_SSIZE_T_MAX) {
1838 PyErr_SetString(PyExc_OverflowError, "input too long");
1839 return NULL;
1840 }
1841 45264200 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1842 }
1843
1844
1845 PyObject *
1846 107451647 _PyUnicode_FromId(_Py_Identifier *id)
1847 {
1848 107451647 PyInterpreterState *interp = _PyInterpreterState_GET();
1849 107451647 struct _Py_unicode_ids *ids = &interp->unicode.ids;
1850
1851 107451647 Py_ssize_t index = _Py_atomic_size_get(&id->index);
1852
2/2
✓ Branch 0 taken 4374 times.
✓ Branch 1 taken 107447273 times.
107451647 if (index < 0) {
1853 4374 struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
1854
1855 4374 PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
1856 // Check again to detect concurrent access. Another thread can have
1857 // initialized the index while this thread waited for the lock.
1858 4374 index = _Py_atomic_size_get(&id->index);
1859
1/2
✓ Branch 0 taken 4374 times.
✗ Branch 1 not taken.
4374 if (index < 0) {
1860 assert(rt_ids->next_index < PY_SSIZE_T_MAX);
1861 4374 index = rt_ids->next_index;
1862 4374 rt_ids->next_index++;
1863 4374 _Py_atomic_size_set(&id->index, index);
1864 }
1865 4374 PyThread_release_lock(rt_ids->lock);
1866 }
1867 assert(index >= 0);
1868
1869 PyObject *obj;
1870
2/2
✓ Branch 0 taken 107451179 times.
✓ Branch 1 taken 468 times.
107451647 if (index < ids->size) {
1871 107451179 obj = ids->array[index];
1872
2/2
✓ Branch 0 taken 107447273 times.
✓ Branch 1 taken 3906 times.
107451179 if (obj) {
1873 // Return a borrowed reference
1874 107447273 return obj;
1875 }
1876 }
1877
1878 4374 obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
1879 NULL, NULL);
1880
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 4374 times.
4374 if (!obj) {
1881 return NULL;
1882 }
1883 4374 PyUnicode_InternInPlace(&obj);
1884
1885
2/2
✓ Branch 0 taken 468 times.
✓ Branch 1 taken 3906 times.
4374 if (index >= ids->size) {
1886 // Overallocate to reduce the number of realloc
1887 468 Py_ssize_t new_size = Py_MAX(index * 2, 16);
1888 468 Py_ssize_t item_size = sizeof(ids->array[0]);
1889 468 PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
1890
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 468 times.
468 if (new_array == NULL) {
1891 PyErr_NoMemory();
1892 return NULL;
1893 }
1894 468 memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
1895 468 ids->array = new_array;
1896 468 ids->size = new_size;
1897 }
1898
1899 // The array stores a strong reference
1900 4374 ids->array[index] = obj;
1901
1902 // Return a borrowed reference
1903 4374 return obj;
1904 }
1905
1906
1907 static void
1908 3404 unicode_clear_identifiers(struct _Py_unicode_state *state)
1909 {
1910 3404 struct _Py_unicode_ids *ids = &state->ids;
1911
2/2
✓ Branch 0 taken 7488 times.
✓ Branch 1 taken 3404 times.
10892 for (Py_ssize_t i=0; i < ids->size; i++) {
1912 7488 Py_XDECREF(ids->array[i]);
1913 }
1914 3404 ids->size = 0;
1915 3404 PyMem_Free(ids->array);
1916 3404 ids->array = NULL;
1917 // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
1918 // after Py_Finalize().
1919 3404 }
1920
1921
1922 /* Internal function, doesn't check maximum character */
1923
1924 PyObject*
1925 35604677 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1926 {
1927 35604677 const unsigned char *s = (const unsigned char *)buffer;
1928 PyObject *unicode;
1929
2/2
✓ Branch 0 taken 3495236 times.
✓ Branch 1 taken 32109441 times.
35604677 if (size == 1) {
1930 #ifdef Py_DEBUG
1931 assert((unsigned char)s[0] < 128);
1932 #endif
1933 3495236 return get_latin1_char(s[0]);
1934 }
1935 32109441 unicode = PyUnicode_New(size, 127);
1936
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 32109441 times.
32109441 if (!unicode)
1937 return NULL;
1938 32109441 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1939 assert(_PyUnicode_CheckConsistency(unicode, 1));
1940 32109441 return unicode;
1941 }
1942
1943 static Py_UCS4
1944 kind_maxchar_limit(int kind)
1945 {
1946 switch (kind) {
1947 case PyUnicode_1BYTE_KIND:
1948 return 0x80;
1949 case PyUnicode_2BYTE_KIND:
1950 return 0x100;
1951 case PyUnicode_4BYTE_KIND:
1952 return 0x10000;
1953 default:
1954 Py_UNREACHABLE();
1955 }
1956 }
1957
1958 static PyObject*
1959 82151850 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
1960 {
1961 PyObject *res;
1962 unsigned char max_char;
1963
1964
2/2
✓ Branch 0 taken 164958 times.
✓ Branch 1 taken 81986892 times.
82151850 if (size == 0) {
1965 164958 _Py_RETURN_UNICODE_EMPTY();
1966 }
1967 assert(size > 0);
1968
2/2
✓ Branch 0 taken 3075067 times.
✓ Branch 1 taken 78911825 times.
81986892 if (size == 1) {
1969 3075067 return get_latin1_char(u[0]);
1970 }
1971
1972 78911825 max_char = ucs1lib_find_max_char(u, u + size);
1973 78911825 res = PyUnicode_New(size, max_char);
1974
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 78911825 times.
78911825 if (!res)
1975 return NULL;
1976 78911825 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1977 assert(_PyUnicode_CheckConsistency(res, 1));
1978 78911825 return res;
1979 }
1980
1981 static PyObject*
1982 102123 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1983 {
1984 PyObject *res;
1985 Py_UCS2 max_char;
1986
1987
2/2
✓ Branch 0 taken 1347 times.
✓ Branch 1 taken 100776 times.
102123 if (size == 0)
1988 1347 _Py_RETURN_UNICODE_EMPTY();
1989 assert(size > 0);
1990
2/2
✓ Branch 0 taken 10531 times.
✓ Branch 1 taken 90245 times.
100776 if (size == 1)
1991 10531 return unicode_char(u[0]);
1992
1993 90245 max_char = ucs2lib_find_max_char(u, u + size);
1994 90245 res = PyUnicode_New(size, max_char);
1995
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 90245 times.
90245 if (!res)
1996 return NULL;
1997
2/2
✓ Branch 0 taken 23171 times.
✓ Branch 1 taken 67074 times.
90245 if (max_char >= 256)
1998 23171 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1999 else {
2000
4/4
✓ Branch 1 taken 644694 times.
✓ Branch 2 taken 67074 times.
✓ Branch 3 taken 99055 times.
✓ Branch 4 taken 67074 times.
810823 _PyUnicode_CONVERT_BYTES(
2001 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2002 }
2003 assert(_PyUnicode_CheckConsistency(res, 1));
2004 90245 return res;
2005 }
2006
2007 static PyObject*
2008 2111361 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2009 {
2010 PyObject *res;
2011 Py_UCS4 max_char;
2012
2013
2/2
✓ Branch 0 taken 4346 times.
✓ Branch 1 taken 2107015 times.
2111361 if (size == 0)
2014 4346 _Py_RETURN_UNICODE_EMPTY();
2015 assert(size > 0);
2016
2/2
✓ Branch 0 taken 511737 times.
✓ Branch 1 taken 1595278 times.
2107015 if (size == 1)
2017 511737 return unicode_char(u[0]);
2018
2019 1595278 max_char = ucs4lib_find_max_char(u, u + size);
2020 1595278 res = PyUnicode_New(size, max_char);
2021
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1595278 times.
1595278 if (!res)
2022 return NULL;
2023
2/2
✓ Branch 0 taken 1593063 times.
✓ Branch 1 taken 2215 times.
1595278 if (max_char < 256)
2024
4/4
✓ Branch 1 taken 24853382 times.
✓ Branch 2 taken 1593063 times.
✓ Branch 3 taken 2357504 times.
✓ Branch 4 taken 1593063 times.
28803949 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2025 PyUnicode_1BYTE_DATA(res));
2026
2/2
✓ Branch 0 taken 1634 times.
✓ Branch 1 taken 581 times.
2215 else if (max_char < 0x10000)
2027
4/4
✓ Branch 1 taken 287370 times.
✓ Branch 2 taken 1634 times.
✓ Branch 3 taken 1741 times.
✓ Branch 4 taken 1634 times.
290745 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2028 PyUnicode_2BYTE_DATA(res));
2029 else
2030 581 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2031 assert(_PyUnicode_CheckConsistency(res, 1));
2032 1595278 return res;
2033 }
2034
2035 PyObject*
2036 84275144 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2037 {
2038
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 84275144 times.
84275144 if (size < 0) {
2039 PyErr_SetString(PyExc_ValueError, "size must be positive");
2040 return NULL;
2041 }
2042
3/4
✓ Branch 0 taken 82100377 times.
✓ Branch 1 taken 66158 times.
✓ Branch 2 taken 2108609 times.
✗ Branch 3 not taken.
84275144 switch (kind) {
2043 82100377 case PyUnicode_1BYTE_KIND:
2044 82100377 return _PyUnicode_FromUCS1(buffer, size);
2045 66158 case PyUnicode_2BYTE_KIND:
2046 66158 return _PyUnicode_FromUCS2(buffer, size);
2047 2108609 case PyUnicode_4BYTE_KIND:
2048 2108609 return _PyUnicode_FromUCS4(buffer, size);
2049 default:
2050 PyErr_SetString(PyExc_SystemError, "invalid kind");
2051 return NULL;
2052 }
2053 }
2054
2055 Py_UCS4
2056 11012665 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2057 {
2058 int kind;
2059 const void *startptr, *endptr;
2060
2061 assert(0 <= start);
2062 assert(end <= PyUnicode_GET_LENGTH(unicode));
2063 assert(start <= end);
2064
2065
4/4
✓ Branch 0 taken 11012070 times.
✓ Branch 1 taken 595 times.
✓ Branch 3 taken 24880 times.
✓ Branch 4 taken 10987190 times.
11012665 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2066 24880 return PyUnicode_MAX_CHAR_VALUE(unicode);
2067
2068
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 10987785 times.
10987785 if (start == end)
2069 return 127;
2070
2071
1/2
✓ Branch 1 taken 10987785 times.
✗ Branch 2 not taken.
10987785 if (PyUnicode_IS_ASCII(unicode))
2072 10987785 return 127;
2073
2074 kind = PyUnicode_KIND(unicode);
2075 startptr = PyUnicode_DATA(unicode);
2076 endptr = (char *)startptr + end * kind;
2077 startptr = (char *)startptr + start * kind;
2078 switch(kind) {
2079 case PyUnicode_1BYTE_KIND:
2080 return ucs1lib_find_max_char(startptr, endptr);
2081 case PyUnicode_2BYTE_KIND:
2082 return ucs2lib_find_max_char(startptr, endptr);
2083 case PyUnicode_4BYTE_KIND:
2084 return ucs4lib_find_max_char(startptr, endptr);
2085 default:
2086 Py_UNREACHABLE();
2087 }
2088 }
2089
2090 /* Ensure that a string uses the most efficient storage, if it is not the
2091 case: create a new string with of the right kind. Write NULL into *p_unicode
2092 on error. */
2093 static void
2094 unicode_adjust_maxchar(PyObject **p_unicode)
2095 {
2096 PyObject *unicode, *copy;
2097 Py_UCS4 max_char;
2098 Py_ssize_t len;
2099 int kind;
2100
2101 assert(p_unicode != NULL);
2102 unicode = *p_unicode;
2103 if (PyUnicode_IS_ASCII(unicode))
2104 return;
2105
2106 len = PyUnicode_GET_LENGTH(unicode);
2107 kind = PyUnicode_KIND(unicode);
2108 if (kind == PyUnicode_1BYTE_KIND) {
2109 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2110 max_char = ucs1lib_find_max_char(u, u + len);
2111 if (max_char >= 128)
2112 return;
2113 }
2114 else if (kind == PyUnicode_2BYTE_KIND) {
2115 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2116 max_char = ucs2lib_find_max_char(u, u + len);
2117 if (max_char >= 256)
2118 return;
2119 }
2120 else if (kind == PyUnicode_4BYTE_KIND) {
2121 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2122 max_char = ucs4lib_find_max_char(u, u + len);
2123 if (max_char >= 0x10000)
2124 return;
2125 }
2126 else
2127 Py_UNREACHABLE();
2128
2129 copy = PyUnicode_New(len, max_char);
2130 if (copy != NULL)
2131 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2132 Py_DECREF(unicode);
2133 *p_unicode = copy;
2134 }
2135
2136 PyObject*
2137 1010755 _PyUnicode_Copy(PyObject *unicode)
2138 {
2139 Py_ssize_t length;
2140 PyObject *copy;
2141
2142
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 1010755 times.
1010755 if (!PyUnicode_Check(unicode)) {
2143 PyErr_BadInternalCall();
2144 return NULL;
2145 }
2146
2147 1010755 length = PyUnicode_GET_LENGTH(unicode);
2148 1010755 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2149
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1010755 times.
1010755 if (!copy)
2150 return NULL;
2151 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2152
2153 1010755 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2154 1010755 length * PyUnicode_KIND(unicode));
2155 assert(_PyUnicode_CheckConsistency(copy, 1));
2156 1010755 return copy;
2157 }
2158
2159
2160 /* Widen Unicode objects to larger buffers. Don't write terminating null
2161 character. Return NULL on error. */
2162
2163 static void*
2164 138 unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2165 {
2166 void *result;
2167
2168 assert(skind < kind);
2169
1/3
✗ Branch 0 not taken.
✓ Branch 1 taken 138 times.
✗ Branch 2 not taken.
138 switch (kind) {
2170 case PyUnicode_2BYTE_KIND:
2171 result = PyMem_New(Py_UCS2, len);
2172 if (!result)
2173 return PyErr_NoMemory();
2174 assert(skind == PyUnicode_1BYTE_KIND);
2175 _PyUnicode_CONVERT_BYTES(
2176 Py_UCS1, Py_UCS2,
2177 (const Py_UCS1 *)data,
2178 ((const Py_UCS1 *)data) + len,
2179 result);
2180 return result;
2181 138 case PyUnicode_4BYTE_KIND:
2182
1/2
✓ Branch 0 taken 138 times.
✗ Branch 1 not taken.
138 result = PyMem_New(Py_UCS4, len);
2183
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 138 times.
138 if (!result)
2184 return PyErr_NoMemory();
2185
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 138 times.
138 if (skind == PyUnicode_2BYTE_KIND) {
2186 _PyUnicode_CONVERT_BYTES(
2187 Py_UCS2, Py_UCS4,
2188 (const Py_UCS2 *)data,
2189 ((const Py_UCS2 *)data) + len,
2190 result);
2191 }
2192 else {
2193 assert(skind == PyUnicode_1BYTE_KIND);
2194
3/4
✗ Branch 0 not taken.
✓ Branch 1 taken 138 times.
✓ Branch 2 taken 138 times.
✓ Branch 3 taken 138 times.
276 _PyUnicode_CONVERT_BYTES(
2195 Py_UCS1, Py_UCS4,
2196 (const Py_UCS1 *)data,
2197 ((const Py_UCS1 *)data) + len,
2198 result);
2199 }
2200 138 return result;
2201 default:
2202 Py_UNREACHABLE();
2203 return NULL;
2204 }
2205 }
2206
2207 static Py_UCS4*
2208 188161 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2209 int copy_null)
2210 {
2211 int kind;
2212 const void *data;
2213 Py_ssize_t len, targetlen;
2214 188161 kind = PyUnicode_KIND(string);
2215 188161 data = PyUnicode_DATA(string);
2216 188161 len = PyUnicode_GET_LENGTH(string);
2217 188161 targetlen = len;
2218
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 188161 times.
188161 if (copy_null)
2219 targetlen++;
2220
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 188161 times.
188161 if (!target) {
2221 target = PyMem_New(Py_UCS4, targetlen);
2222 if (!target) {
2223 PyErr_NoMemory();
2224 return NULL;
2225 }
2226 }
2227 else {
2228
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 188161 times.
188161 if (targetsize < targetlen) {
2229 PyErr_Format(PyExc_SystemError,
2230 "string is longer than the buffer");
2231 if (copy_null && 0 < targetsize)
2232 target[0] = 0;
2233 return NULL;
2234 }
2235 }
2236
2/2
✓ Branch 0 taken 186348 times.
✓ Branch 1 taken 1813 times.
188161 if (kind == PyUnicode_1BYTE_KIND) {
2237 186348 const Py_UCS1 *start = (const Py_UCS1 *) data;
2238
4/4
✓ Branch 0 taken 22099170 times.
✓ Branch 1 taken 186348 times.
✓ Branch 2 taken 278659 times.
✓ Branch 3 taken 186348 times.
22564177 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2239 }
2240
2/2
✓ Branch 0 taken 1370 times.
✓ Branch 1 taken 443 times.
1813 else if (kind == PyUnicode_2BYTE_KIND) {
2241 1370 const Py_UCS2 *start = (const Py_UCS2 *) data;
2242
4/4
✓ Branch 0 taken 1318068 times.
✓ Branch 1 taken 1370 times.
✓ Branch 2 taken 1345 times.
✓ Branch 3 taken 1370 times.
1320783 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2243 }
2244
1/2
✓ Branch 0 taken 443 times.
✗ Branch 1 not taken.
443 else if (kind == PyUnicode_4BYTE_KIND) {
2245 443 memcpy(target, data, len * sizeof(Py_UCS4));
2246 }
2247 else {
2248 Py_UNREACHABLE();
2249 }
2250
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 188161 times.
188161 if (copy_null)
2251 target[len] = 0;
2252 188161 return target;
2253 }
2254
2255 Py_UCS4*
2256 188161 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2257 int copy_null)
2258 {
2259
2/4
✓ Branch 0 taken 188161 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 188161 times.
188161 if (target == NULL || targetsize < 0) {
2260 PyErr_BadInternalCall();
2261 return NULL;
2262 }
2263 188161 return as_ucs4(string, target, targetsize, copy_null);
2264 }
2265
2266 Py_UCS4*
2267 PyUnicode_AsUCS4Copy(PyObject *string)
2268 {
2269 return as_ucs4(string, NULL, 0, 1);
2270 }
2271
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
2276
2277 static int
2278 85491061 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2279 Py_ssize_t width, Py_ssize_t precision)
2280 {
2281 Py_ssize_t length, fill, arglen;
2282 Py_UCS4 maxchar;
2283
2284 85491061 length = PyUnicode_GET_LENGTH(str);
2285
4/4
✓ Branch 0 taken 9204 times.
✓ Branch 1 taken 85481857 times.
✓ Branch 2 taken 9184 times.
✓ Branch 3 taken 20 times.
85491061 if ((precision == -1 || precision >= length)
2286
1/2
✓ Branch 0 taken 85491041 times.
✗ Branch 1 not taken.
85491041 && width <= length)
2287 85491041 return _PyUnicodeWriter_WriteStr(writer, str);
2288
2289
1/2
✓ Branch 0 taken 20 times.
✗ Branch 1 not taken.
20 if (precision != -1)
2290 20 length = Py_MIN(precision, length);
2291
2292 20 arglen = Py_MAX(length, width);
2293
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 20 times.
20 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2294 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2295 else
2296 20 maxchar = writer->maxchar;
2297
2298
4/8
✓ Branch 0 taken 20 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 20 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 20 times.
✗ Branch 5 not taken.
✗ Branch 7 not taken.
✓ Branch 8 taken 20 times.
20 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2299 return -1;
2300
2301
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 20 times.
20 if (width > length) {
2302 fill = width - length;
2303 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2304 return -1;
2305 writer->pos += fill;
2306 }
2307
2308 20 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2309 str, 0, length);
2310 20 writer->pos += length;
2311 20 return 0;
2312 }
2313
2314 static int
2315 34201095 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2316 Py_ssize_t width, Py_ssize_t precision)
2317 {
2318 /* UTF-8 */
2319 Py_ssize_t length;
2320 PyObject *unicode;
2321 int res;
2322
2323
2/2
✓ Branch 0 taken 648967 times.
✓ Branch 1 taken 33552128 times.
34201095 if (precision == -1) {
2324 648967 length = strlen(str);
2325 }
2326 else {
2327 33552128 length = 0;
2328
3/4
✓ Branch 0 taken 313529235 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 279977107 times.
✓ Branch 3 taken 33552128 times.
313529235 while (length < precision && str[length]) {
2329 279977107 length++;
2330 }
2331 }
2332 34201095 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2333
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 34201095 times.
34201095 if (unicode == NULL)
2334 return -1;
2335
2336 34201095 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2337 34201095 Py_DECREF(unicode);
2338 34201095 return res;
2339 }
2340
2341 static const char*
2342 104392079 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2343 const char *f, va_list *vargs)
2344 {
2345 const char *p;
2346 Py_ssize_t len;
2347 int zeropad;
2348 Py_ssize_t width;
2349 Py_ssize_t precision;
2350 int longflag;
2351 int longlongflag;
2352 int size_tflag;
2353 Py_ssize_t fill;
2354
2355 104392079 p = f;
2356 104392079 f++;
2357 104392079 zeropad = 0;
2358
2/2
✓ Branch 0 taken 19656 times.
✓ Branch 1 taken 104372423 times.
104392079 if (*f == '0') {
2359 19656 zeropad = 1;
2360 19656 f++;
2361 }
2362
2363 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2364 104392079 width = -1;
2365
2/2
✓ Branch 0 taken 19656 times.
✓ Branch 1 taken 104372423 times.
104392079 if (Py_ISDIGIT((unsigned)*f)) {
2366 19656 width = *f - '0';
2367 19656 f++;
2368
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 19656 times.
19656 while (Py_ISDIGIT((unsigned)*f)) {
2369 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2370 PyErr_SetString(PyExc_ValueError,
2371 "width too big");
2372 return NULL;
2373 }
2374 width = (width * 10) + (*f - '0');
2375 f++;
2376 }
2377 }
2378 104392079 precision = -1;
2379
2/2
✓ Branch 0 taken 33561332 times.
✓ Branch 1 taken 70830747 times.
104392079 if (*f == '.') {
2380 33561332 f++;
2381
1/2
✓ Branch 0 taken 33561332 times.
✗ Branch 1 not taken.
33561332 if (Py_ISDIGIT((unsigned)*f)) {
2382 33561332 precision = (*f - '0');
2383 33561332 f++;
2384
2/2
✓ Branch 0 taken 35227996 times.
✓ Branch 1 taken 33561332 times.
68789328 while (Py_ISDIGIT((unsigned)*f)) {
2385
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 35227996 times.
35227996 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2386 PyErr_SetString(PyExc_ValueError,
2387 "precision too big");
2388 return NULL;
2389 }
2390 35227996 precision = (precision * 10) + (*f - '0');
2391 35227996 f++;
2392 }
2393 }
2394
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 33561332 times.
33561332 if (*f == '%') {
2395 /* "%.3%s" => f points to "3" */
2396 f--;
2397 }
2398 }
2399
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 104392079 times.
104392079 if (*f == '\0') {
2400 /* bogus format "%.123" => go backward, f points to "3" */
2401 f--;
2402 }
2403
2404 /* Handle %ld, %lu, %lld and %llu. */
2405 104392079 longflag = 0;
2406 104392079 longlongflag = 0;
2407 104392079 size_tflag = 0;
2408
2/2
✓ Branch 0 taken 18850104 times.
✓ Branch 1 taken 85541975 times.
104392079 if (*f == 'l') {
2409
2/6
✓ Branch 0 taken 18850104 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 18850104 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
18850104 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2410 18850104 longflag = 1;
2411 18850104 ++f;
2412 }
2413 else if (f[1] == 'l' &&
2414 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2415 longlongflag = 1;
2416 f += 2;
2417 }
2418 }
2419 /* handle the size_t flag. */
2420
1/8
✗ Branch 0 not taken.
✓ Branch 1 taken 85541975 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
85541975 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2421 size_tflag = 1;
2422 ++f;
2423 }
2424
2425
2/2
✓ Branch 0 taken 20569560 times.
✓ Branch 1 taken 83822519 times.
104392079 if (f[1] == '\0')
2426 20569560 writer->overallocate = 0;
2427
2428
8/11
✓ Branch 0 taken 3025 times.
✓ Branch 1 taken 18894985 times.
✓ Branch 2 taken 2966 times.
✓ Branch 3 taken 34201095 times.
✓ Branch 4 taken 51252705 times.
✓ Branch 5 taken 672 times.
✓ Branch 6 taken 2127 times.
✓ Branch 7 taken 34504 times.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
104392079 switch (*f) {
2429 3025 case 'c':
2430 {
2431 3025 int ordinal = va_arg(*vargs, int);
2432
2/4
✓ Branch 0 taken 3025 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3025 times.
3025 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2433 PyErr_SetString(PyExc_OverflowError,
2434 "character argument not in range(0x110000)");
2435 return NULL;
2436 }
2437
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3025 times.
3025 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2438 return NULL;
2439 3025 break;
2440 }
2441
2442 18894985 case 'i':
2443 case 'd':
2444 case 'u':
2445 case 'x':
2446 {
2447 /* used by sprintf */
2448 char buffer[MAX_LONG_LONG_CHARS];
2449 Py_ssize_t arglen;
2450
2451
2/2
✓ Branch 0 taken 18850104 times.
✓ Branch 1 taken 44881 times.
18894985 if (*f == 'u') {
2452
1/2
✓ Branch 0 taken 18850104 times.
✗ Branch 1 not taken.
18850104 if (longflag) {
2453 18850104 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2454 }
2455 else if (longlongflag) {
2456 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2457 }
2458 else if (size_tflag) {
2459 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2460 }
2461 else {
2462 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2463 }
2464 }
2465
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 44881 times.
44881 else if (*f == 'x') {
2466 len = sprintf(buffer, "%x", va_arg(*vargs, int));
2467 }
2468 else {
2469
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 44881 times.
44881 if (longflag) {
2470 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2471 }
2472
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 44881 times.
44881 else if (longlongflag) {
2473 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2474 }
2475
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 44881 times.
44881 else if (size_tflag) {
2476 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2477 }
2478 else {
2479 44881 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2480 }
2481 }
2482 assert(len >= 0);
2483
2484
1/2
✓ Branch 0 taken 18894985 times.
✗ Branch 1 not taken.
18894985 if (precision < len)
2485 18894985 precision = len;
2486
2487 18894985 arglen = Py_MAX(precision, width);
2488
5/8
✓ Branch 0 taken 18891961 times.
✓ Branch 1 taken 3024 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 18891961 times.
✓ Branch 4 taken 3024 times.
✗ Branch 5 not taken.
✗ Branch 7 not taken.
✓ Branch 8 taken 3024 times.
18894985 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2489 return NULL;
2490
2491
2/2
✓ Branch 0 taken 5154 times.
✓ Branch 1 taken 18889831 times.
18894985 if (width > precision) {
2492 Py_UCS4 fillchar;
2493 5154 fill = width - precision;
2494
1/2
✓ Branch 0 taken 5154 times.
✗ Branch 1 not taken.
5154 fillchar = zeropad?'0':' ';
2495
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 5154 times.
5154 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2496 return NULL;
2497 5154 writer->pos += fill;
2498 }
2499
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 18894985 times.
18894985 if (precision > len) {
2500 fill = precision - len;
2501 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2502 return NULL;
2503 writer->pos += fill;
2504 }
2505
2506
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 18894985 times.
18894985 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2507 return NULL;
2508 18894985 break;
2509 }
2510
2511 2966 case 'p':
2512 {
2513 char number[MAX_LONG_LONG_CHARS];
2514
2515 2966 len = sprintf(number, "%p", va_arg(*vargs, void*));
2516 assert(len >= 0);
2517
2518 /* %p is ill-defined: ensure leading 0x. */
2519
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2966 times.
2966 if (number[1] == 'X')
2520 number[1] = 'x';
2521
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2966 times.
2966 else if (number[1] != 'x') {
2522 memmove(number + 2, number,
2523 strlen(number) + 1);
2524 number[0] = '0';
2525 number[1] = 'x';
2526 len += 2;
2527 }
2528
2529
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 2966 times.
2966 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2530 return NULL;
2531 2966 break;
2532 }
2533
2534 34201095 case 's':
2535 {
2536 /* UTF-8 */
2537 34201095 const char *s = va_arg(*vargs, const char*);
2538
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 34201095 times.
34201095 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2539 return NULL;
2540 34201095 break;
2541 }
2542
2543 51252705 case 'U':
2544 {
2545 51252705 PyObject *obj = va_arg(*vargs, PyObject *);
2546 assert(obj && _PyUnicode_CHECK(obj));
2547
2548
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 51252705 times.
51252705 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2549 return NULL;
2550 51252705 break;
2551 }
2552
2553 672 case 'V':
2554 {
2555 672 PyObject *obj = va_arg(*vargs, PyObject *);
2556 672 const char *str = va_arg(*vargs, const char *);
2557
1/2
✓ Branch 0 taken 672 times.
✗ Branch 1 not taken.
672 if (obj) {
2558 assert(_PyUnicode_CHECK(obj));
2559
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 672 times.
672 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2560 return NULL;
2561 }
2562 else {
2563 assert(str != NULL);
2564 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2565 return NULL;
2566 }
2567 672 break;
2568 }
2569
2570 2127 case 'S':
2571 {
2572 2127 PyObject *obj = va_arg(*vargs, PyObject *);
2573 PyObject *str;
2574 assert(obj);
2575 2127 str = PyObject_Str(obj);
2576
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2127 times.
2127 if (!str)
2577 return NULL;
2578
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 2127 times.
2127 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2579 Py_DECREF(str);
2580 return NULL;
2581 }
2582 2127 Py_DECREF(str);
2583 2127 break;
2584 }
2585
2586 34504 case 'R':
2587 {
2588 34504 PyObject *obj = va_arg(*vargs, PyObject *);
2589 PyObject *repr;
2590 assert(obj);
2591 34504 repr = PyObject_Repr(obj);
2592
2/2
✓ Branch 0 taken 42 times.
✓ Branch 1 taken 34462 times.
34504 if (!repr)
2593 42 return NULL;
2594
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 34462 times.
34462 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2595 Py_DECREF(repr);
2596 return NULL;
2597 }
2598 34462 Py_DECREF(repr);
2599 34462 break;
2600 }
2601
2602 case 'A':
2603 {
2604 PyObject *obj = va_arg(*vargs, PyObject *);
2605 PyObject *ascii;
2606 assert(obj);
2607 ascii = PyObject_ASCII(obj);
2608 if (!ascii)
2609 return NULL;
2610 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2611 Py_DECREF(ascii);
2612 return NULL;
2613 }
2614 Py_DECREF(ascii);
2615 break;
2616 }
2617
2618 case '%':
2619 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2620 return NULL;
2621 break;
2622
2623 default:
2624 /* if we stumble upon an unknown formatting code, copy the rest
2625 of the format string to the output string. (we cannot just
2626 skip the code, since there's no way to know what's in the
2627 argument list) */
2628 len = strlen(p);
2629 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2630 return NULL;
2631 f = p+len;
2632 return f;
2633 }
2634
2635 104392037 f++;
2636 104392037 return f;
2637 }
2638
2639 PyObject *
2640 62175127 PyUnicode_FromFormatV(const char *format, va_list vargs)
2641 {
2642 va_list vargs2;
2643 const char *f;
2644 _PyUnicodeWriter writer;
2645
2646 62175127 _PyUnicodeWriter_Init(&writer);
2647 62175127 writer.min_length = strlen(format) + 100;
2648 62175127 writer.overallocate = 1;
2649
2650 // Copy varags to be able to pass a reference to a subfunction.
2651 62175127 va_copy(vargs2, vargs);
2652
2653
2/2
✓ Branch 0 taken 248675001 times.
✓ Branch 1 taken 62175085 times.
310850086 for (f = format; *f; ) {
2654
2/2
✓ Branch 0 taken 104392079 times.
✓ Branch 1 taken 144282922 times.
248675001 if (*f == '%') {
2655 104392079 f = unicode_fromformat_arg(&writer, f, &vargs2);
2656
2/2
✓ Branch 0 taken 42 times.
✓ Branch 1 taken 104392037 times.
104392079 if (f == NULL)
2657 42 goto fail;
2658 }
2659 else {
2660 const char *p;
2661 Py_ssize_t len;
2662
2663 144282922 p = f;
2664 do
2665 {
2666
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1423371698 times.
1423371698 if ((unsigned char)*p > 127) {
2667 PyErr_Format(PyExc_ValueError,
2668 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2669 "string, got a non-ASCII byte: 0x%02x",
2670 (unsigned char)*p);
2671 goto fail;
2672 }
2673 1423371698 p++;
2674 }
2675
4/4
✓ Branch 0 taken 1381766131 times.
✓ Branch 1 taken 41605567 times.
✓ Branch 2 taken 1279088776 times.
✓ Branch 3 taken 102677355 times.
1423371698 while (*p != '\0' && *p != '%');
2676 144282922 len = p - f;
2677
2678
2/2
✓ Branch 0 taken 41605567 times.
✓ Branch 1 taken 102677355 times.
144282922 if (*p == '\0')
2679 41605567 writer.overallocate = 0;
2680
2681
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 144282922 times.
144282922 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2682 goto fail;
2683
2684 144282922 f = p;
2685 }
2686 }
2687 62175085 va_end(vargs2);
2688 62175085 return _PyUnicodeWriter_Finish(&writer);
2689
2690 42 fail:
2691 42 va_end(vargs2);
2692 42 _PyUnicodeWriter_Dealloc(&writer);
2693 42 return NULL;
2694 }
2695
2696 PyObject *
2697 20567347 PyUnicode_FromFormat(const char *format, ...)
2698 {
2699 PyObject* ret;
2700 va_list vargs;
2701
2702 20567347 va_start(vargs, format);
2703 20567347 ret = PyUnicode_FromFormatV(format, vargs);
2704 20567347 va_end(vargs);
2705 20567347 return ret;
2706 }
2707
2708 static Py_ssize_t
2709 403125 unicode_get_widechar_size(PyObject *unicode)
2710 {
2711 Py_ssize_t res;
2712
2713 assert(unicode != NULL);
2714 assert(_PyUnicode_CHECK(unicode));
2715
2716 403125 res = _PyUnicode_LENGTH(unicode);
2717 #if SIZEOF_WCHAR_T == 2
2718 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
2719 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2720 const Py_UCS4 *end = s + res;
2721 for (; s < end; ++s) {
2722 if (*s > 0xFFFF) {
2723 ++res;
2724 }
2725 }
2726 }
2727 #endif
2728 403125 return res;
2729 }
2730
2731 static void
2732 403125 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
2733 {
2734 assert(unicode != NULL);
2735 assert(_PyUnicode_CHECK(unicode));
2736
2737
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 403125 times.
403125 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
2738 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
2739 return;
2740 }
2741
2742
1/2
✓ Branch 0 taken 403125 times.
✗ Branch 1 not taken.
403125 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
2743 403125 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
2744
2/2
✓ Branch 0 taken 23487503 times.
✓ Branch 1 taken 403125 times.
23890628 for (; size--; ++s, ++w) {
2745 23487503 *w = *s;
2746 }
2747 }
2748 else {
2749 #if SIZEOF_WCHAR_T == 4
2750 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
2751 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
2752 for (; size--; ++s, ++w) {
2753 *w = *s;
2754 }
2755 #else
2756 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2757 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2758 for (; size--; ++s, ++w) {
2759 Py_UCS4 ch = *s;
2760 if (ch > 0xFFFF) {
2761 assert(ch <= MAX_UNICODE);
2762 /* encode surrogate pair in this case */
2763 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
2764 if (!size--)
2765 break;
2766 *w = Py_UNICODE_LOW_SURROGATE(ch);
2767 }
2768 else {
2769 *w = ch;
2770 }
2771 }
2772 #endif
2773 }
2774 }
2775
2776 #ifdef HAVE_WCHAR_H
2777
2778 /* Convert a Unicode object to a wide character string.
2779
2780 - If w is NULL: return the number of wide characters (including the null
2781 character) required to convert the unicode object. Ignore size argument.
2782
2783 - Otherwise: return the number of wide characters (excluding the null
2784 character) written into w. Write at most size wide characters (including
2785 the null character). */
2786 Py_ssize_t
2787 PyUnicode_AsWideChar(PyObject *unicode,
2788 wchar_t *w,
2789 Py_ssize_t size)
2790 {
2791 Py_ssize_t res;
2792
2793 if (unicode == NULL) {
2794 PyErr_BadInternalCall();
2795 return -1;
2796 }
2797 if (!PyUnicode_Check(unicode)) {
2798 PyErr_BadArgument();
2799 return -1;
2800 }
2801
2802 res = unicode_get_widechar_size(unicode);
2803 if (w == NULL) {
2804 return res + 1;
2805 }
2806
2807 if (size > res) {
2808 size = res + 1;
2809 }
2810 else {
2811 res = size;
2812 }
2813 unicode_copy_as_widechar(unicode, w, size);
2814
2815 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2816 /* Oracle Solaris uses non-Unicode internal wchar_t form for
2817 non-Unicode locales and hence needs conversion first. */
2818 if (_Py_LocaleUsesNonUnicodeWchar()) {
2819 if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
2820 return -1;
2821 }
2822 }
2823 #endif
2824
2825 return res;
2826 }
2827
2828 wchar_t*
2829 403125 PyUnicode_AsWideCharString(PyObject *unicode,
2830 Py_ssize_t *size)
2831 {
2832 wchar_t *buffer;
2833 Py_ssize_t buflen;
2834
2835
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 403125 times.
403125 if (unicode == NULL) {
2836 PyErr_BadInternalCall();
2837 return NULL;
2838 }
2839
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 403125 times.
403125 if (!PyUnicode_Check(unicode)) {
2840 PyErr_BadArgument();
2841 return NULL;
2842 }
2843
2844 403125 buflen = unicode_get_widechar_size(unicode);
2845
1/2
✓ Branch 0 taken 403125 times.
✗ Branch 1 not taken.
403125 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
2846
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 403125 times.
403125 if (buffer == NULL) {
2847 PyErr_NoMemory();
2848 return NULL;
2849 }
2850 403125 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
2851
2852 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2853 /* Oracle Solaris uses non-Unicode internal wchar_t form for
2854 non-Unicode locales and hence needs conversion first. */
2855 if (_Py_LocaleUsesNonUnicodeWchar()) {
2856 if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
2857 return NULL;
2858 }
2859 }
2860 #endif
2861
2862
2/2
✓ Branch 0 taken 244933 times.
✓ Branch 1 taken 158192 times.
403125 if (size != NULL) {
2863 244933 *size = buflen;
2864 }
2865
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 158192 times.
158192 else if (wcslen(buffer) != (size_t)buflen) {
2866 PyMem_Free(buffer);
2867 PyErr_SetString(PyExc_ValueError,
2868 "embedded null character");
2869 return NULL;
2870 }
2871 403125 return buffer;
2872 }
2873
2874 #endif /* HAVE_WCHAR_H */
2875
2876 int
2877 _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
2878 {
2879 wchar_t **p = (wchar_t **)ptr;
2880 if (obj == NULL) {
2881 PyMem_Free(*p);
2882 *p = NULL;
2883 return 1;
2884 }
2885 if (PyUnicode_Check(obj)) {
2886 *p = PyUnicode_AsWideCharString(obj, NULL);
2887 if (*p == NULL) {
2888 return 0;
2889 }
2890 return Py_CLEANUP_SUPPORTED;
2891 }
2892 PyErr_Format(PyExc_TypeError,
2893 "argument must be str, not %.50s",
2894 Py_TYPE(obj)->tp_name);
2895 return 0;
2896 }
2897
2898 int
2899 _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
2900 {
2901 wchar_t **p = (wchar_t **)ptr;
2902 if (obj == NULL) {
2903 PyMem_Free(*p);
2904 *p = NULL;
2905 return 1;
2906 }
2907 if (obj == Py_None) {
2908 *p = NULL;
2909 return 1;
2910 }
2911 if (PyUnicode_Check(obj)) {
2912 *p = PyUnicode_AsWideCharString(obj, NULL);
2913 if (*p == NULL) {
2914 return 0;
2915 }
2916 return Py_CLEANUP_SUPPORTED;
2917 }
2918 PyErr_Format(PyExc_TypeError,
2919 "argument must be str or None, not %.50s",
2920 Py_TYPE(obj)->tp_name);
2921 return 0;
2922 }
2923
2924 PyObject *
2925 1244427 PyUnicode_FromOrdinal(int ordinal)
2926 {
2927
2/4
✓ Branch 0 taken 1244427 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1244427 times.
1244427 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2928 PyErr_SetString(PyExc_ValueError,
2929 "chr() arg not in range(0x110000)");
2930 return NULL;
2931 }
2932
2933 1244427 return unicode_char((Py_UCS4)ordinal);
2934 }
2935
2936 PyObject *
2937 1902242 PyUnicode_FromObject(PyObject *obj)
2938 {
2939 /* XXX Perhaps we should make this API an alias of
2940 PyObject_Str() instead ?! */
2941
2/2
✓ Branch 1 taken 1902095 times.
✓ Branch 2 taken 147 times.
1902242 if (PyUnicode_CheckExact(obj)) {
2942 1902095 Py_INCREF(obj);
2943 1902095 return obj;
2944 }
2945
1/2
✓ Branch 2 taken 147 times.
✗ Branch 3 not taken.
147 if (PyUnicode_Check(obj)) {
2946 /* For a Unicode subtype that's not a Unicode object,
2947 return a true Unicode object with the same data. */
2948 147 return _PyUnicode_Copy(obj);
2949 }
2950 PyErr_Format(PyExc_TypeError,
2951 "Can't convert '%.100s' object to str implicitly",
2952 Py_TYPE(obj)->tp_name);
2953 return NULL;
2954 }
2955
2956 PyObject *
2957 1789574 PyUnicode_FromEncodedObject(PyObject *obj,
2958 const char *encoding,
2959 const char *errors)
2960 {
2961 Py_buffer buffer;
2962 PyObject *v;
2963
2964
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1789574 times.
1789574 if (obj == NULL) {
2965 PyErr_BadInternalCall();
2966 return NULL;
2967 }
2968
2969 /* Decoding bytes objects is the most common case and should be fast */
2970
2/2
✓ Branch 2 taken 1785414 times.
✓ Branch 3 taken 4160 times.
1789574 if (PyBytes_Check(obj)) {
2971
2/2
✓ Branch 1 taken 24516 times.
✓ Branch 2 taken 1760898 times.
1785414 if (PyBytes_GET_SIZE(obj) == 0) {
2972
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 24516 times.
24516 if (unicode_check_encoding_errors(encoding, errors) < 0) {
2973 return NULL;
2974 }
2975 24516 _Py_RETURN_UNICODE_EMPTY();
2976 }
2977 3521796 return PyUnicode_Decode(
2978 1760898 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2979 encoding, errors);
2980 }
2981
2982
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 4160 times.
4160 if (PyUnicode_Check(obj)) {
2983 PyErr_SetString(PyExc_TypeError,
2984 "decoding str is not supported");
2985 return NULL;
2986 }
2987
2988 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2989
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 4160 times.
4160 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2990 PyErr_Format(PyExc_TypeError,
2991 "decoding to str: need a bytes-like object, %.80s found",
2992 Py_TYPE(obj)->tp_name);
2993 return NULL;
2994 }
2995
2996
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 4160 times.
4160 if (buffer.len == 0) {
2997 PyBuffer_Release(&buffer);
2998 if (unicode_check_encoding_errors(encoding, errors) < 0) {
2999 return NULL;
3000 }
3001 _Py_RETURN_UNICODE_EMPTY();
3002 }
3003
3004 4160 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3005 4160 PyBuffer_Release(&buffer);
3006 4160 return v;
3007 }
3008
3009 /* Normalize an encoding name: similar to encodings.normalize_encoding(), but
3010 also convert to lowercase. Return 1 on success, or 0 on error (encoding is
3011 longer than lower_len-1). */
3012 int
3013 3884722 _Py_normalize_encoding(const char *encoding,
3014 char *lower,
3015 size_t lower_len)
3016 {
3017 const char *e;
3018 char *l;
3019 char *l_end;
3020 int punct;
3021
3022 assert(encoding != NULL);
3023
3024 3884722 e = encoding;
3025 3884722 l = lower;
3026 3884722 l_end = &lower[lower_len - 1];
3027 3884722 punct = 0;
3028 19550970 while (1) {
3029 23435692 char c = *e;
3030
2/2
✓ Branch 0 taken 3884722 times.
✓ Branch 1 taken 19550970 times.
23435692 if (c == 0) {
3031 3884722 break;
3032 }
3033
3034
4/4
✓ Branch 0 taken 1626719 times.
✓ Branch 1 taken 17924251 times.
✓ Branch 2 taken 320 times.
✓ Branch 3 taken 1626399 times.
19550970 if (Py_ISALNUM(c) || c == '.') {
3035
3/4
✓ Branch 0 taken 1626399 times.
✓ Branch 1 taken 16298172 times.
✓ Branch 2 taken 1626399 times.
✗ Branch 3 not taken.
17924571 if (punct && l != lower) {
3036
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1626399 times.
1626399 if (l == l_end) {
3037 return 0;
3038 }
3039 1626399 *l++ = '_';
3040 }
3041 17924571 punct = 0;
3042
3043
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 17924571 times.
17924571 if (l == l_end) {
3044 return 0;
3045 }
3046 17924571 *l++ = Py_TOLOWER(c);
3047 }
3048 else {
3049 1626399 punct = 1;
3050 }
3051
3052 19550970 e++;
3053 }
3054 3884722 *l = '\0';
3055 3884722 return 1;
3056 }
3057
3058 PyObject *
3059 1767783 PyUnicode_Decode(const char *s,
3060 Py_ssize_t size,
3061 const char *encoding,
3062 const char *errors)
3063 {
3064 1767783 PyObject *buffer = NULL, *unicode;
3065 Py_buffer info;
3066 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3067
3068
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 1767783 times.
1767783 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3069 return NULL;
3070 }
3071
3072
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1767783 times.
1767783 if (size == 0) {
3073 _Py_RETURN_UNICODE_EMPTY();
3074 }
3075
3076
2/2
✓ Branch 0 taken 16100 times.
✓ Branch 1 taken 1751683 times.
1767783 if (encoding == NULL) {
3077 16100 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3078 }
3079
3080 /* Shortcuts for common default encodings */
3081
1/2
✓ Branch 1 taken 1751683 times.
✗ Branch 2 not taken.
1751683 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3082 1751683 char *lower = buflower;
3083
3084 /* Fast paths */
3085
4/6
✓ Branch 0 taken 706532 times.
✓ Branch 1 taken 1045151 times.
✓ Branch 2 taken 706532 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 706532 times.
✗ Branch 5 not taken.
1751683 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3086 706532 lower += 3;
3087
2/2
✓ Branch 0 taken 705812 times.
✓ Branch 1 taken 720 times.
706532 if (*lower == '_') {
3088 /* Match "utf8" and "utf_8" */
3089 705812 lower++;
3090 }
3091
3092
2/4
✓ Branch 0 taken 706532 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 706532 times.
✗ Branch 3 not taken.
706532 if (lower[0] == '8' && lower[1] == 0) {
3093 706532 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3094 }
3095 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3096 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3097 }
3098 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3099 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3100 }
3101 }
3102 else {
3103
2/2
✓ Branch 0 taken 88707 times.
✓ Branch 1 taken 956444 times.
1045151 if (strcmp(lower, "ascii") == 0
3104
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 88707 times.
88707 || strcmp(lower, "us_ascii") == 0) {
3105 956444 return PyUnicode_DecodeASCII(s, size, errors);
3106 }
3107 #ifdef MS_WINDOWS
3108 else if (strcmp(lower, "mbcs") == 0) {
3109 return PyUnicode_DecodeMBCS(s, size, errors);
3110 }
3111 #endif
3112
2/2
✓ Branch 0 taken 40927 times.
✓ Branch 1 taken 47780 times.
88707 else if (strcmp(lower, "latin1") == 0
3113
2/2
✓ Branch 0 taken 39478 times.
✓ Branch 1 taken 1449 times.
40927 || strcmp(lower, "latin_1") == 0
3114
2/2
✓ Branch 0 taken 39426 times.
✓ Branch 1 taken 52 times.
39478 || strcmp(lower, "iso_8859_1") == 0
3115
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 39426 times.
39426 || strcmp(lower, "iso8859_1") == 0) {
3116 49281 return PyUnicode_DecodeLatin1(s, size, errors);
3117 }
3118 }
3119 }
3120
3121 /* Decode via the codec registry */
3122 39426 buffer = NULL;
3123
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 39426 times.
39426 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3124 goto onError;
3125 39426 buffer = PyMemoryView_FromBuffer(&info);
3126
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 39426 times.
39426 if (buffer == NULL)
3127 goto onError;
3128 39426 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3129
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 39426 times.
39426 if (unicode == NULL)
3130 goto onError;
3131
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 39426 times.
39426 if (!PyUnicode_Check(unicode)) {
3132 PyErr_Format(PyExc_TypeError,
3133 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3134 "use codecs.decode() to decode to arbitrary types",
3135 encoding,
3136 Py_TYPE(unicode)->tp_name);
3137 Py_DECREF(unicode);
3138 goto onError;
3139 }
3140 39426 Py_DECREF(buffer);
3141 39426 return unicode_result(unicode);
3142
3143 onError:
3144 Py_XDECREF(buffer);
3145 return NULL;
3146 }
3147
3148 PyObject *
3149 PyUnicode_AsDecodedObject(PyObject *unicode,
3150 const char *encoding,
3151 const char *errors)
3152 {
3153 if (!PyUnicode_Check(unicode)) {
3154 PyErr_BadArgument();
3155 return NULL;
3156 }
3157
3158 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3159 "PyUnicode_AsDecodedObject() is deprecated; "
3160 "use PyCodec_Decode() to decode from str", 1) < 0)
3161 return NULL;
3162
3163 if (encoding == NULL)
3164 encoding = PyUnicode_GetDefaultEncoding();
3165
3166 /* Decode via the codec registry */
3167 return PyCodec_Decode(unicode, encoding, errors);
3168 }
3169
3170 PyObject *
3171 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3172 const char *encoding,
3173 const char *errors)
3174 {
3175 PyObject *v;
3176
3177 if (!PyUnicode_Check(unicode)) {
3178 PyErr_BadArgument();
3179 goto onError;
3180 }
3181
3182 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3183 "PyUnicode_AsDecodedUnicode() is deprecated; "
3184 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3185 return NULL;
3186
3187 if (encoding == NULL)
3188 encoding = PyUnicode_GetDefaultEncoding();
3189
3190 /* Decode via the codec registry */
3191 v = PyCodec_Decode(unicode, encoding, errors);
3192 if (v == NULL)
3193 goto onError;
3194 if (!PyUnicode_Check(v)) {
3195 PyErr_Format(PyExc_TypeError,
3196 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3197 "use codecs.decode() to decode to arbitrary types",
3198 encoding,
3199 Py_TYPE(unicode)->tp_name);
3200 Py_DECREF(v);
3201 goto onError;
3202 }
3203 return unicode_result(v);
3204
3205 onError:
3206 return NULL;
3207 }
3208
3209 PyObject *
3210 PyUnicode_AsEncodedObject(PyObject *unicode,
3211 const char *encoding,
3212 const char *errors)
3213 {
3214 PyObject *v;
3215
3216 if (!PyUnicode_Check(unicode)) {
3217 PyErr_BadArgument();
3218 goto onError;
3219 }
3220
3221 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3222 "PyUnicode_AsEncodedObject() is deprecated; "
3223 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3224 "or PyCodec_Encode() for generic encoding", 1) < 0)
3225 return NULL;
3226
3227 if (encoding == NULL)
3228 encoding = PyUnicode_GetDefaultEncoding();
3229
3230 /* Encode via the codec registry */
3231 v = PyCodec_Encode(unicode, encoding, errors);
3232 if (v == NULL)
3233 goto onError;
3234 return v;
3235
3236 onError:
3237 return NULL;
3238 }
3239
3240
3241 static PyObject *
3242 81871 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3243 int current_locale)
3244 {
3245 Py_ssize_t wlen;
3246 81871 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3247
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 81871 times.
81871 if (wstr == NULL) {
3248 return NULL;
3249 }
3250
3251
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 81871 times.
81871 if ((size_t)wlen != wcslen(wstr)) {
3252 PyErr_SetString(PyExc_ValueError, "embedded null character");
3253 PyMem_Free(wstr);
3254 return NULL;
3255 }
3256
3257 char *str;
3258 size_t error_pos;
3259 const char *reason;
3260 81871 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3261 current_locale, error_handler);
3262 81871 PyMem_Free(wstr);
3263
3264
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 81871 times.
81871 if (res != 0) {
3265 if (res == -2) {
3266 PyObject *exc;
3267 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3268 "locale", unicode,
3269 (Py_ssize_t)error_pos,
3270 (Py_ssize_t)(error_pos+1),
3271 reason);
3272 if (exc != NULL) {
3273 PyCodec_StrictErrors(exc);
3274 Py_DECREF(exc);
3275 }
3276 }
3277 else if (res == -3) {
3278 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3279 }
3280 else {
3281 PyErr_NoMemory();
3282 }
3283 return NULL;
3284 }
3285
3286 81871 PyObject *bytes = PyBytes_FromString(str);
3287 81871 PyMem_RawFree(str);
3288 81871 return bytes;
3289 }
3290
3291 PyObject *
3292 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3293 {
3294 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3295 return unicode_encode_locale(unicode, error_handler, 1);
3296 }
3297
3298 PyObject *
3299 3469095 PyUnicode_EncodeFSDefault(PyObject *unicode)
3300 {
3301 3469095 PyInterpreterState *interp = _PyInterpreterState_GET();
3302 3469095 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3303
2/2
✓ Branch 0 taken 3384984 times.
✓ Branch 1 taken 84111 times.
3469095 if (fs_codec->utf8) {
3304 3384984 return unicode_encode_utf8(unicode,
3305 fs_codec->error_handler,
3306 3384984 fs_codec->errors);
3307 }
3308 #ifndef _Py_FORCE_UTF8_FS_ENCODING
3309
2/2
✓ Branch 0 taken 2240 times.
✓ Branch 1 taken 81871 times.
84111 else if (fs_codec->encoding) {
3310 2240 return PyUnicode_AsEncodedString(unicode,
3311 2240 fs_codec->encoding,
3312 2240 fs_codec->errors);
3313 }
3314 #endif
3315 else {
3316 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3317 machinery is not ready and so cannot be used:
3318 use wcstombs() in this case. */
3319 81871 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3320 81871 const wchar_t *filesystem_errors = config->filesystem_errors;
3321 assert(filesystem_errors != NULL);
3322 81871 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3323 assert(errors != _Py_ERROR_UNKNOWN);
3324 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3325 return unicode_encode_utf8(unicode, errors, NULL);
3326 #else
3327 81871 return unicode_encode_locale(unicode, errors, 0);
3328 #endif
3329 }
3330 }
3331
3332 PyObject *
3333 2081359 PyUnicode_AsEncodedString(PyObject *unicode,
3334 const char *encoding,
3335 const char *errors)
3336 {
3337 PyObject *v;
3338 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
3339
3340
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 2081359 times.
2081359 if (!PyUnicode_Check(unicode)) {
3341 PyErr_BadArgument();
3342 return NULL;
3343 }
3344
3345
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 2081359 times.
2081359 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3346 return NULL;
3347 }
3348
3349
2/2
✓ Branch 0 taken 59159 times.
✓ Branch 1 taken 2022200 times.
2081359 if (encoding == NULL) {
3350 59159 return _PyUnicode_AsUTF8String(unicode, errors);
3351 }
3352
3353 /* Shortcuts for common default encodings */
3354
1/2
✓ Branch 1 taken 2022200 times.
✗ Branch 2 not taken.
2022200 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3355 2022200 char *lower = buflower;
3356
3357 /* Fast paths */
3358
5/6
✓ Branch 0 taken 882146 times.
✓ Branch 1 taken 1140054 times.
✓ Branch 2 taken 881842 times.
✓ Branch 3 taken 304 times.
✓ Branch 4 taken 881842 times.
✗ Branch 5 not taken.
2022200 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3359 881842 lower += 3;
3360
2/2
✓ Branch 0 taken 862629 times.
✓ Branch 1 taken 19213 times.
881842 if (*lower == '_') {
3361 /* Match "utf8" and "utf_8" */
3362 862629 lower++;
3363 }
3364
3365
4/4
✓ Branch 0 taken 881578 times.
✓ Branch 1 taken 264 times.
✓ Branch 2 taken 881534 times.
✓ Branch 3 taken 44 times.
881842 if (lower[0] == '8' && lower[1] == 0) {
3366 881534 return _PyUnicode_AsUTF8String(unicode, errors);
3367 }
3368
5/6
✓ Branch 0 taken 132 times.
✓ Branch 1 taken 176 times.
✓ Branch 2 taken 132 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 44 times.
✓ Branch 5 taken 88 times.
308 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3369 44 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3370 }
3371
5/6
✓ Branch 0 taken 132 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 132 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 44 times.
✓ Branch 5 taken 88 times.
264 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3372 44 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3373 }
3374 }
3375 else {
3376
2/2
✓ Branch 0 taken 126633 times.
✓ Branch 1 taken 1013725 times.
1140358 if (strcmp(lower, "ascii") == 0
3377
2/2
✓ Branch 0 taken 304 times.
✓ Branch 1 taken 126329 times.
126633 || strcmp(lower, "us_ascii") == 0) {
3378 1014029 return _PyUnicode_AsASCIIString(unicode, errors);
3379 }
3380 #ifdef MS_WINDOWS
3381 else if (strcmp(lower, "mbcs") == 0) {
3382 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3383 }
3384 #endif
3385
2/2
✓ Branch 0 taken 12907 times.
✓ Branch 1 taken 113422 times.
126329 else if (strcmp(lower, "latin1") == 0 ||
3386
2/2
✓ Branch 0 taken 12751 times.
✓ Branch 1 taken 156 times.
12907 strcmp(lower, "latin_1") == 0 ||
3387
1/2
✓ Branch 0 taken 12751 times.
✗ Branch 1 not taken.
12751 strcmp(lower, "iso_8859_1") == 0 ||
3388
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 12751 times.
12751 strcmp(lower, "iso8859_1") == 0) {
3389 113578 return _PyUnicode_AsLatin1String(unicode, errors);
3390 }
3391 }
3392 }
3393
3394 /* Encode via the codec registry */
3395 12971 v = _PyCodec_EncodeText(unicode, encoding, errors);
3396
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 12971 times.
12971 if (v == NULL)
3397 return NULL;
3398
3399 /* The normal path */
3400
1/2
✓ Branch 2 taken 12971 times.
✗ Branch 3 not taken.
12971 if (PyBytes_Check(v))
3401 12971 return v;
3402
3403 /* If the codec returns a buffer, raise a warning and convert to bytes */
3404 if (PyByteArray_Check(v)) {
3405 int error;
3406 PyObject *b;
3407
3408 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3409 "encoder %s returned bytearray instead of bytes; "
3410 "use codecs.encode() to encode to arbitrary types",
3411 encoding);
3412 if (error) {
3413 Py_DECREF(v);
3414 return NULL;
3415 }
3416
3417 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3418 PyByteArray_GET_SIZE(v));
3419 Py_DECREF(v);
3420 return b;
3421 }
3422
3423 PyErr_Format(PyExc_TypeError,
3424 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3425 "use codecs.encode() to encode to arbitrary types",
3426 encoding,
3427 Py_TYPE(v)->tp_name);
3428 Py_DECREF(v);
3429 return NULL;
3430 }
3431
3432 PyObject *
3433 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3434 const char *encoding,
3435 const char *errors)
3436 {
3437 PyObject *v;
3438
3439 if (!PyUnicode_Check(unicode)) {
3440 PyErr_BadArgument();
3441 goto onError;
3442 }
3443
3444 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3445 "PyUnicode_AsEncodedUnicode() is deprecated; "
3446 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3447 return NULL;
3448
3449 if (encoding == NULL)
3450 encoding = PyUnicode_GetDefaultEncoding();
3451
3452 /* Encode via the codec registry */
3453 v = PyCodec_Encode(unicode, encoding, errors);
3454 if (v == NULL)
3455 goto onError;
3456 if (!PyUnicode_Check(v)) {
3457 PyErr_Format(PyExc_TypeError,
3458 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3459 "use codecs.encode() to encode to arbitrary types",
3460 encoding,
3461 Py_TYPE(v)->tp_name);
3462 Py_DECREF(v);
3463 goto onError;
3464 }
3465 return v;
3466
3467 onError:
3468 return NULL;
3469 }
3470
3471 static PyObject*
3472 1615410 unicode_decode_locale(const char *str, Py_ssize_t len,
3473 _Py_error_handler errors, int current_locale)
3474 {
3475
2/4
✓ Branch 0 taken 1615410 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1615410 times.
1615410 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3476 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3477 return NULL;
3478 }
3479
3480 wchar_t *wstr;
3481 size_t wlen;
3482 const char *reason;
3483 1615410 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3484 current_locale, errors);
3485
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1615410 times.
1615410 if (res != 0) {
3486 if (res == -2) {
3487 PyObject *exc;
3488 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3489 "locale", str, len,
3490 (Py_ssize_t)wlen,
3491 (Py_ssize_t)(wlen + 1),
3492 reason);
3493 if (exc != NULL) {
3494 PyCodec_StrictErrors(exc);
3495 Py_DECREF(exc);
3496 }
3497 }
3498 else if (res == -3) {
3499 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3500 }
3501 else {
3502 PyErr_NoMemory();
3503 }
3504 return NULL;
3505 }
3506
3507 1615410 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3508 1615410 PyMem_RawFree(wstr);
3509 1615410 return unicode;
3510 }
3511
3512 PyObject*
3513 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3514 const char *errors)
3515 {
3516 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3517 return unicode_decode_locale(str, len, error_handler, 1);
3518 }
3519
3520 PyObject*
3521 501890 PyUnicode_DecodeLocale(const char *str, const char *errors)
3522 {
3523 501890 Py_ssize_t size = (Py_ssize_t)strlen(str);
3524 501890 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3525 501890 return unicode_decode_locale(str, size, error_handler, 1);
3526 }
3527
3528
3529 PyObject*
3530 368774 PyUnicode_DecodeFSDefault(const char *s) {
3531 368774 Py_ssize_t size = (Py_ssize_t)strlen(s);
3532 368774 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3533 }
3534
3535 PyObject*
3536 3968417 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3537 {
3538 3968417 PyInterpreterState *interp = _PyInterpreterState_GET();
3539 3968417 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3540
2/2
✓ Branch 0 taken 2852187 times.
✓ Branch 1 taken 1116230 times.
3968417 if (fs_codec->utf8) {
3541 2852187 return unicode_decode_utf8(s, size,
3542 fs_codec->error_handler,
3543 2852187 fs_codec->errors,
3544 NULL);
3545 }
3546 #ifndef _Py_FORCE_UTF8_FS_ENCODING
3547
2/2
✓ Branch 0 taken 2710 times.
✓ Branch 1 taken 1113520 times.
1116230 else if (fs_codec->encoding) {
3548 2710 return PyUnicode_Decode(s, size,
3549 2710 fs_codec->encoding,
3550 2710 fs_codec->errors);
3551 }
3552 #endif
3553 else {
3554 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3555 machinery is not ready and so cannot be used:
3556 use mbstowcs() in this case. */
3557 1113520 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3558 1113520 const wchar_t *filesystem_errors = config->filesystem_errors;
3559 assert(filesystem_errors != NULL);
3560 1113520 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3561 assert(errors != _Py_ERROR_UNKNOWN);
3562 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3563 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3564 #else
3565 1113520 return unicode_decode_locale(s, size, errors, 0);
3566 #endif
3567 }
3568 }
3569
3570
3571 int
3572 3421226 PyUnicode_FSConverter(PyObject* arg, void* addr)
3573 {
3574 3421226 PyObject *path = NULL;
3575 3421226 PyObject *output = NULL;
3576 Py_ssize_t size;
3577 const char *data;
3578
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3421226 times.
3421226 if (arg == NULL) {
3579 Py_DECREF(*(PyObject**)addr);
3580 *(PyObject**)addr = NULL;
3581 return 1;
3582 }
3583 3421226 path = PyOS_FSPath(arg);
3584
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3421226 times.
3421226 if (path == NULL) {
3585 return 0;
3586 }
3587
2/2
✓ Branch 2 taken 452 times.
✓ Branch 3 taken 3420774 times.
3421226 if (PyBytes_Check(path)) {
3588 452 output = path;
3589 }
3590 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3591 3420774 output = PyUnicode_EncodeFSDefault(path);
3592 3420774 Py_DECREF(path);
3593
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3420774 times.
3420774 if (!output) {
3594 return 0;
3595 }
3596 assert(PyBytes_Check(output));
3597 }
3598
3599 3421226 size = PyBytes_GET_SIZE(output);
3600 3421226 data = PyBytes_AS_STRING(output);
3601
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3421226 times.
3421226 if ((size_t)size != strlen(data)) {
3602 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3603 Py_DECREF(output);
3604 return 0;
3605 }
3606 3421226 *(PyObject**)addr = output;
3607 3421226 return Py_CLEANUP_SUPPORTED;
3608 }
3609
3610
3611 int
3612 29231 PyUnicode_FSDecoder(PyObject* arg, void* addr)
3613 {
3614 29231 int is_buffer = 0;
3615 29231 PyObject *path = NULL;
3616 29231 PyObject *output = NULL;
3617
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 29231 times.
29231 if (arg == NULL) {
3618 Py_DECREF(*(PyObject**)addr);
3619 *(PyObject**)addr = NULL;
3620 return 1;
3621 }
3622
3623 29231 is_buffer = PyObject_CheckBuffer(arg);
3624
1/2
✓ Branch 0 taken 29231 times.
✗ Branch 1 not taken.
29231 if (!is_buffer) {
3625 29231 path = PyOS_FSPath(arg);
3626
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 29231 times.
29231 if (path == NULL) {
3627 return 0;
3628 }
3629 }
3630 else {
3631 path = arg;
3632 Py_INCREF(arg);
3633 }
3634
3635
1/2
✓ Branch 2 taken 29231 times.
✗ Branch 3 not taken.
29231 if (PyUnicode_Check(path)) {
3636 29231 output = path;
3637 }
3638 else if (PyBytes_Check(path) || is_buffer) {
3639 PyObject *path_bytes = NULL;
3640
3641 if (!PyBytes_Check(path) &&
3642 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3643 "path should be string, bytes, or os.PathLike, not %.200s",
3644 Py_TYPE(arg)->tp_name)) {
3645 Py_DECREF(path);
3646 return 0;
3647 }
3648 path_bytes = PyBytes_FromObject(path);
3649 Py_DECREF(path);
3650 if (!path_bytes) {
3651 return 0;
3652 }
3653 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3654 PyBytes_GET_SIZE(path_bytes));
3655 Py_DECREF(path_bytes);
3656 if (!output) {
3657 return 0;
3658 }
3659 }
3660 else {
3661 PyErr_Format(PyExc_TypeError,
3662 "path should be string, bytes, or os.PathLike, not %.200s",
3663 Py_TYPE(arg)->tp_name);
3664 Py_DECREF(path);
3665 return 0;
3666 }
3667
1/2
✗ Branch 3 not taken.
✓ Branch 4 taken 29231 times.
29231 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3668 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3669 PyErr_SetString(PyExc_ValueError, "embedded null character");
3670 Py_DECREF(output);
3671 return 0;
3672 }
3673 29231 *(PyObject**)addr = output;
3674 29231 return Py_CLEANUP_SUPPORTED;
3675 }
3676
3677
3678 static int unicode_fill_utf8(PyObject *unicode);
3679
3680 const char *
3681 22051080 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3682 {
3683
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 22051080 times.
22051080 if (!PyUnicode_Check(unicode)) {
3684 PyErr_BadArgument();
3685 return NULL;
3686 }
3687
3688
4/4
✓ Branch 1 taken 22049884 times.
✓ Branch 2 taken 1196 times.
✓ Branch 3 taken 1086 times.
✓ Branch 4 taken 22049994 times.
22051080 if (PyUnicode_UTF8(unicode) == NULL) {
3689
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 1086 times.
1086 if (unicode_fill_utf8(unicode) == -1) {
3690 return NULL;
3691 }
3692 }
3693
3694
2/2
✓ Branch 0 taken 19819291 times.
✓ Branch 1 taken 2231789 times.
22051080 if (psize)
3695
2/2
✓ Branch 1 taken 19819153 times.
✓ Branch 2 taken 138 times.
19819291 *psize = PyUnicode_UTF8_LENGTH(unicode);
3696
2/2
✓ Branch 1 taken 22049884 times.
✓ Branch 2 taken 1196 times.
22051080 return PyUnicode_UTF8(unicode);
3697 }
3698
3699 const char *
3700 2231789 PyUnicode_AsUTF8(PyObject *unicode)
3701 {
3702 2231789 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3703 }
3704
3705 /*
3706 PyUnicode_GetSize() has been deprecated since Python 3.3
3707 because it returned length of Py_UNICODE.
3708
3709 But this function is part of stable abi, because it don't
3710 include Py_UNICODE in signature and it was not excluded from
3711 stable abi in PEP 384.
3712 */
3713 PyAPI_FUNC(Py_ssize_t)
3714 PyUnicode_GetSize(PyObject *unicode)
3715 {
3716 PyErr_SetString(PyExc_RuntimeError,
3717 "PyUnicode_GetSize has been removed.");
3718 return -1;
3719 }
3720
3721 Py_ssize_t
3722 64864 PyUnicode_GetLength(PyObject *unicode)
3723 {
3724
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 64864 times.
64864 if (!PyUnicode_Check(unicode)) {
3725 PyErr_BadArgument();
3726 return -1;
3727 }
3728 64864 return PyUnicode_GET_LENGTH(unicode);
3729 }
3730
3731 Py_UCS4
3732 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3733 {
3734 const void *data;
3735 int kind;
3736
3737 if (!PyUnicode_Check(unicode)) {
3738 PyErr_BadArgument();
3739 return (Py_UCS4)-1;
3740 }
3741 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3742 PyErr_SetString(PyExc_IndexError, "string index out of range");
3743 return (Py_UCS4)-1;
3744 }
3745 data = PyUnicode_DATA(unicode);
3746 kind = PyUnicode_KIND(unicode);
3747 return PyUnicode_READ(kind, data, index);
3748 }
3749
3750 int
3751 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3752 {
3753 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3754 PyErr_BadArgument();
3755 return -1;
3756 }
3757 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3758 PyErr_SetString(PyExc_IndexError, "string index out of range");
3759 return -1;
3760 }
3761 if (unicode_check_modifiable(unicode))
3762 return -1;
3763 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3764 PyErr_SetString(PyExc_ValueError, "character out of range");
3765 return -1;
3766 }
3767 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3768 index, ch);
3769 return 0;
3770 }
3771
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
3777
3778 /* create or adjust a UnicodeDecodeError */
3779 static void
3780 make_decode_exception(PyObject **exceptionObject,
3781 const char *encoding,
3782 const char *input, Py_ssize_t length,
3783 Py_ssize_t startpos, Py_ssize_t endpos,
3784 const char *reason)
3785 {
3786 if (*exceptionObject == NULL) {
3787 *exceptionObject = PyUnicodeDecodeError_Create(
3788 encoding, input, length, startpos, endpos, reason);
3789 }
3790 else {
3791 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3792 goto onError;
3793 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3794 goto onError;
3795 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3796 goto onError;
3797 }
3798 return;
3799
3800 onError:
3801 Py_CLEAR(*exceptionObject);
3802 }
3803
3804 #ifdef MS_WINDOWS
3805 static int
3806 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
3807 {
3808 if (newsize > *size) {
3809 wchar_t *newbuf = *buf;
3810 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
3811 PyErr_NoMemory();
3812 return -1;
3813 }
3814 *buf = newbuf;
3815 }
3816 *size = newsize;
3817 return 0;
3818 }
3819
3820 /* error handling callback helper:
3821 build arguments, call the callback and check the arguments,
3822 if no exception occurred, copy the replacement to the output
3823 and adjust various state variables.
3824 return 0 on success, -1 on error
3825 */
3826
3827 static int
3828 unicode_decode_call_errorhandler_wchar(
3829 const char *errors, PyObject **errorHandler,
3830 const char *encoding, const char *reason,
3831 const char **input, const char **inend, Py_ssize_t *startinpos,
3832 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3833 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
3834 {
3835 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
3836
3837 PyObject *restuple = NULL;
3838 PyObject *repunicode = NULL;
3839 Py_ssize_t outsize;
3840 Py_ssize_t insize;
3841 Py_ssize_t requiredsize;
3842 Py_ssize_t newpos;
3843 PyObject *inputobj = NULL;
3844 Py_ssize_t repwlen;
3845
3846 if (*errorHandler == NULL) {
3847 *errorHandler = PyCodec_LookupError(errors);
3848 if (*errorHandler == NULL)
3849 goto onError;
3850 }
3851
3852 make_decode_exception(exceptionObject,
3853 encoding,
3854 *input, *inend - *input,
3855 *startinpos, *endinpos,
3856 reason);
3857 if (*exceptionObject == NULL)
3858 goto onError;
3859
3860 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
3861 if (restuple == NULL)
3862 goto onError;
3863 if (!PyTuple_Check(restuple)) {
3864 PyErr_SetString(PyExc_TypeError, &argparse[3]);
3865 goto onError;
3866 }
3867 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
3868 goto onError;
3869
3870 /* Copy back the bytes variables, which might have been modified by the
3871 callback */
3872 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3873 if (!inputobj)
3874 goto onError;
3875 *input = PyBytes_AS_STRING(inputobj);
3876 insize = PyBytes_GET_SIZE(inputobj);
3877 *inend = *input + insize;
3878 /* we can DECREF safely, as the exception has another reference,
3879 so the object won't go away. */
3880 Py_DECREF(inputobj);
3881
3882 if (newpos<0)
3883 newpos = insize+newpos;
3884 if (newpos<0 || newpos>insize) {
3885 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3886 goto onError;
3887 }
3888
3889 repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
3890 if (repwlen < 0)
3891 goto onError;
3892 repwlen--;
3893 /* need more space? (at least enough for what we
3894 have+the replacement+the rest of the string (starting
3895 at the new input position), so we won't have to check space
3896 when there are no errors in the rest of the string) */
3897 requiredsize = *outpos;
3898 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
3899 goto overflow;
3900 requiredsize += repwlen;
3901 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
3902 goto overflow;
3903 requiredsize += insize - newpos;
3904 outsize = *bufsize;
3905 if (requiredsize > outsize) {
3906 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
3907 requiredsize = 2*outsize;
3908 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
3909 goto onError;
3910 }
3911 }
3912 PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
3913 *outpos += repwlen;
3914 *endinpos = newpos;
3915 *inptr = *input + newpos;
3916
3917 /* we made it! */
3918 Py_DECREF(restuple);
3919 return 0;
3920
3921 overflow:
3922 PyErr_SetString(PyExc_OverflowError,
3923 "decoded result is too long for a Python string");
3924
3925 onError:
3926 Py_XDECREF(restuple);
3927 return -1;
3928 }
3929 #endif /* MS_WINDOWS */
3930
3931 static int
3932 unicode_decode_call_errorhandler_writer(
3933 const char *errors, PyObject **errorHandler,
3934 const char *encoding, const char *reason,
3935 const char **input, const char **inend, Py_ssize_t *startinpos,
3936 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3937 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
3938 {
3939 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
3940
3941 PyObject *restuple = NULL;
3942 PyObject *repunicode = NULL;
3943 Py_ssize_t insize;
3944 Py_ssize_t newpos;
3945 Py_ssize_t replen;
3946 Py_ssize_t remain;
3947 PyObject *inputobj = NULL;
3948 int need_to_grow = 0;
3949 const char *new_inptr;
3950
3951 if (*errorHandler == NULL) {
3952 *errorHandler = PyCodec_LookupError(errors);
3953 if (*errorHandler == NULL)
3954 goto onError;
3955 }
3956
3957 make_decode_exception(exceptionObject,
3958 encoding,
3959 *input, *inend - *input,
3960 *startinpos, *endinpos,
3961 reason);
3962 if (*exceptionObject == NULL)
3963 goto onError;
3964
3965 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
3966 if (restuple == NULL)
3967 goto onError;
3968 if (!PyTuple_Check(restuple)) {
3969 PyErr_SetString(PyExc_TypeError, &argparse[3]);
3970 goto onError;
3971 }
3972 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
3973 goto onError;
3974
3975 /* Copy back the bytes variables, which might have been modified by the
3976 callback */
3977 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3978 if (!inputobj)
3979 goto onError;
3980 remain = *inend - *input - *endinpos;
3981 *input = PyBytes_AS_STRING(inputobj);
3982 insize = PyBytes_GET_SIZE(inputobj);
3983 *inend = *input + insize;
3984 /* we can DECREF safely, as the exception has another reference,
3985 so the object won't go away. */
3986 Py_DECREF(inputobj);
3987
3988 if (newpos<0)
3989 newpos = insize+newpos;
3990 if (newpos<0 || newpos>insize) {
3991 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3992 goto onError;
3993 }
3994
3995 replen = PyUnicode_GET_LENGTH(repunicode);
3996 if (replen > 1) {
3997 writer->min_length += replen - 1;
3998 need_to_grow = 1;
3999 }
4000 new_inptr = *input + newpos;
4001 if (*inend - new_inptr > remain) {
4002 /* We don't know the decoding algorithm here so we make the worst
4003 assumption that one byte decodes to one unicode character.
4004 If unfortunately one byte could decode to more unicode characters,
4005 the decoder may write out-of-bound then. Is it possible for the
4006 algorithms using this function? */
4007 writer->min_length += *inend - new_inptr - remain;
4008 need_to_grow = 1;
4009 }
4010 if (need_to_grow) {
4011 writer->overallocate = 1;
4012 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4013 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4014 goto onError;
4015 }
4016 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4017 goto onError;
4018
4019 *endinpos = newpos;
4020 *inptr = new_inptr;
4021
4022 /* we made it! */
4023 Py_DECREF(restuple);
4024 return 0;
4025
4026 onError:
4027 Py_XDECREF(restuple);
4028 return -1;
4029 }
4030
4031 /* --- UTF-7 Codec -------------------------------------------------------- */
4032
4033 /* See RFC2152 for details. We encode conservatively and decode liberally. */
4034
/* Base-64 helper macros used by the UTF-7 codec.  All of them may evaluate
   their argument more than once, so pass side-effect free expressions. */

/* IS_BASE64(c): true if c is one of A-Z, a-z, 0-9, '+' or '/'. */
#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')

/* FROM_BASE64(c): the 6-bit value of c.  Only meaningful when
   IS_BASE64(c) holds; any other input maps to 63. */
#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64(n): the base-64 character for the low 6 bits of n. */
#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT(c): true if byte c found in UTF-7 input decodes as itself.
 * Decoding is permissive: every ASCII byte qualifies except '+', which
 * starts a base-64 section. */
#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
4065
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. plus, backslash, tilde and the non-printing
 *     codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const char utf7_category[128] = {
    /* 0x00-0x0f: nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10-0x1f: dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20-0x2f: space and punctuation up to the slash */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30-0x3f: digits, then colon through question mark */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40-0x4f: at sign, then A through O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5f: P through Z, then brackets, backslash, caret, underscore */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60-0x6f: backquote, then a through o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7f: p through z, then braces, bar, tilde, del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times; directO and directWS are parenthesized so
 * that arbitrary expressions can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
4111
4112 PyObject *
4113 PyUnicode_DecodeUTF7(const char *s,
4114 Py_ssize_t size,
4115 const char *errors)
4116 {
4117 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4118 }
4119
4120 /* The decoder. The only state we preserve is our read position,
4121 * i.e. how many characters we have consumed. So if we end in the
4122 * middle of a shift sequence we have to back off the read position
4123 * and the output to the beginning of the sequence, otherwise we lose
4124 * all the shift state (seen bits, number of bits seen, high
4125 * surrogate). */
4126
4127 PyObject *
4128 PyUnicode_DecodeUTF7Stateful(const char *s,
4129 Py_ssize_t size,
4130 const char *errors,
4131 Py_ssize_t *consumed)
4132 {
4133 const char *starts = s;
4134 Py_ssize_t startinpos;
4135 Py_ssize_t endinpos;
4136 const char *e;
4137 _PyUnicodeWriter writer;
4138 const char *errmsg = "";
4139 int inShift = 0;
4140 Py_ssize_t shiftOutStart;
4141 unsigned int base64bits = 0;
4142 unsigned long base64buffer = 0;
4143 Py_UCS4 surrogate = 0;
4144 PyObject *errorHandler = NULL;
4145 PyObject *exc = NULL;
4146
4147 if (size == 0) {
4148 if (consumed)
4149 *consumed = 0;
4150 _Py_RETURN_UNICODE_EMPTY();
4151 }
4152
4153 /* Start off assuming it's all ASCII. Widen later as necessary. */
4154 _PyUnicodeWriter_Init(&writer);
4155 writer.min_length = size;
4156
4157 shiftOutStart = 0;
4158 e = s + size;
4159
4160 while (s < e) {
4161 Py_UCS4 ch;
4162 restart:
4163 ch = (unsigned char) *s;
4164
4165 if (inShift) { /* in a base-64 section */
4166 if (IS_BASE64(ch)) { /* consume a base-64 character */
4167 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4168 base64bits += 6;
4169 s++;
4170 if (base64bits >= 16) {
4171 /* we have enough bits for a UTF-16 value */
4172 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4173 base64bits -= 16;
4174 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4175 assert(outCh <= 0xffff);
4176 if (surrogate) {
4177 /* expecting a second surrogate */
4178 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4179 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4180 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4181 goto onError;
4182 surrogate = 0;
4183 continue;
4184 }
4185 else {
4186 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4187 goto onError;
4188 surrogate = 0;
4189 }
4190 }
4191 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4192 /* first surrogate */
4193 surrogate = outCh;
4194 }
4195 else {
4196 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4197 goto onError;
4198 }
4199 }
4200 }
4201 else { /* now leaving a base-64 section */
4202 inShift = 0;
4203 if (base64bits > 0) { /* left-over bits */
4204 if (base64bits >= 6) {
4205 /* We've seen at least one base-64 character */
4206 s++;
4207 errmsg = "partial character in shift sequence";
4208 goto utf7Error;
4209 }
4210 else {
4211 /* Some bits remain; they should be zero */
4212 if (base64buffer != 0) {
4213 s++;
4214 errmsg = "non-zero padding bits in shift sequence";
4215 goto utf7Error;
4216 }
4217 }
4218 }
4219 if (surrogate && DECODE_DIRECT(ch)) {
4220 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4221 goto onError;
4222 }
4223 surrogate = 0;
4224 if (ch == '-') {
4225 /* '-' is absorbed; other terminating
4226 characters are preserved */
4227 s++;
4228 }
4229 }
4230 }
4231 else if ( ch == '+' ) {
4232 startinpos = s-starts;
4233 s++; /* consume '+' */
4234 if (s < e && *s == '-') { /* '+-' encodes '+' */
4235 s++;
4236 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4237 goto onError;
4238 }
4239 else if (s < e && !IS_BASE64(*s)) {
4240 s++;
4241 errmsg = "ill-formed sequence";
4242 goto utf7Error;
4243 }
4244 else { /* begin base64-encoded section */
4245 inShift = 1;
4246 surrogate = 0;
4247 shiftOutStart = writer.pos;
4248 base64bits = 0;
4249 base64buffer = 0;
4250 }
4251 }
4252 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4253 s++;
4254 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4255 goto onError;
4256 }
4257 else {
4258 startinpos = s-starts;
4259 s++;
4260 errmsg = "unexpected special character";
4261 goto utf7Error;
4262 }
4263 continue;
4264 utf7Error:
4265 endinpos = s-starts;
4266 if (unicode_decode_call_errorhandler_writer(
4267 errors, &errorHandler,
4268 "utf7", errmsg,
4269 &starts, &e, &startinpos, &endinpos, &exc, &s,
4270 &writer))
4271 goto onError;
4272 }
4273
4274 /* end of string */
4275
4276 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4277 /* if we're in an inconsistent state, that's an error */
4278 inShift = 0;
4279 if (surrogate ||
4280 (base64bits >= 6) ||
4281 (base64bits > 0 && base64buffer != 0)) {
4282 endinpos = size;
4283 if (unicode_decode_call_errorhandler_writer(
4284 errors, &errorHandler,
4285 "utf7", "unterminated shift sequence",
4286 &starts, &e, &startinpos, &endinpos, &exc, &s,
4287 &writer))
4288 goto onError;
4289 if (s < e)
4290 goto restart;
4291 }
4292 }
4293
4294 /* return state */
4295 if (consumed) {
4296 if (inShift) {
4297 *consumed = startinpos;
4298 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4299 PyObject *result = PyUnicode_FromKindAndData(
4300 writer.kind, writer.data, shiftOutStart);
4301 Py_XDECREF(errorHandler);
4302 Py_XDECREF(exc);
4303 _PyUnicodeWriter_Dealloc(&writer);
4304 return result;
4305 }
4306 writer.pos = shiftOutStart; /* back off output */
4307 }
4308 else {
4309 *consumed = s-starts;
4310 }
4311 }
4312
4313 Py_XDECREF(errorHandler);
4314 Py_XDECREF(exc);
4315 return _PyUnicodeWriter_Finish(&writer);
4316
4317 onError:
4318 Py_XDECREF(errorHandler);
4319 Py_XDECREF(exc);
4320 _PyUnicodeWriter_Dealloc(&writer);
4321 return NULL;
4322 }
4323
4324
4325 PyObject *
4326 _PyUnicode_EncodeUTF7(PyObject *str,
4327 int base64SetO,
4328 int base64WhiteSpace,
4329 const char *errors)
4330 {
4331 int kind;
4332 const void *data;
4333 Py_ssize_t len;
4334 PyObject *v;
4335 int inShift = 0;
4336 Py_ssize_t i;
4337 unsigned int base64bits = 0;
4338 unsigned long base64buffer = 0;
4339 char * out;
4340 const char * start;
4341
4342 kind = PyUnicode_KIND(str);
4343 data = PyUnicode_DATA(str);
4344 len = PyUnicode_GET_LENGTH(str);
4345
4346 if (len == 0)
4347 return PyBytes_FromStringAndSize(NULL, 0);
4348
4349 /* It might be possible to tighten this worst case */
4350 if (len > PY_SSIZE_T_MAX / 8)
4351 return PyErr_NoMemory();
4352 v = PyBytes_FromStringAndSize(NULL, len * 8);
4353 if (v == NULL)
4354 return NULL;
4355
4356 start = out = PyBytes_AS_STRING(v);
4357 for (i = 0; i < len; ++i) {
4358 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4359
4360 if (inShift) {
4361 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4362 /* shifting out */
4363 if (base64bits) { /* output remaining bits */
4364 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4365 base64buffer = 0;
4366 base64bits = 0;
4367 }
4368 inShift = 0;
4369 /* Characters not in the BASE64 set implicitly unshift the sequence
4370 so no '-' is required, except if the character is itself a '-' */
4371 if (IS_BASE64(ch) || ch == '-') {
4372 *out++ = '-';
4373 }
4374 *out++ = (char) ch;
4375 }
4376 else {
4377 goto encode_char;
4378 }
4379 }
4380 else { /* not in a shift sequence */
4381 if (ch == '+') {
4382 *out++ = '+';
4383 *out++ = '-';
4384 }
4385 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4386 *out++ = (char) ch;
4387 }
4388 else {
4389 *out++ = '+';
4390 inShift = 1;
4391 goto encode_char;
4392 }
4393 }
4394 continue;
4395 encode_char:
4396 if (ch >= 0x10000) {
4397 assert(ch <= MAX_UNICODE);
4398
4399 /* code first surrogate */
4400 base64bits += 16;
4401 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4402 while (base64bits >= 6) {
4403 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4404 base64bits -= 6;
4405 }
4406 /* prepare second surrogate */
4407 ch = Py_UNICODE_LOW_SURROGATE(ch);
4408 }
4409 base64bits += 16;
4410 base64buffer = (base64buffer << 16) | ch;
4411 while (base64bits >= 6) {
4412 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4413 base64bits -= 6;
4414 }
4415 }
4416 if (base64bits)
4417 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4418 if (inShift)
4419 *out++ = '-';
4420 if (_PyBytes_Resize(&v, out - start) < 0)
4421 return NULL;
4422 return v;
4423 }
4424
4425 #undef IS_BASE64
4426 #undef FROM_BASE64
4427 #undef TO_BASE64
4428 #undef DECODE_DIRECT
4429 #undef ENCODE_DIRECT
4430
4431 /* --- UTF-8 Codec -------------------------------------------------------- */
4432
4433 PyObject *
4434 65472974 PyUnicode_DecodeUTF8(const char *s,
4435 Py_ssize_t size,
4436 const char *errors)
4437 {
4438 65472974 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4439 }
4440
4441 #include "stringlib/asciilib.h"
4442 #include "stringlib/codecs.h"
4443 #include "stringlib/undef.h"
4444
4445 #include "stringlib/ucs1lib.h"
4446 #include "stringlib/codecs.h"
4447 #include "stringlib/undef.h"
4448
4449 #include "stringlib/ucs2lib.h"
4450 #include "stringlib/codecs.h"
4451 #include "stringlib/undef.h"
4452
4453 #include "stringlib/ucs4lib.h"
4454 #include "stringlib/codecs.h"
4455 #include "stringlib/undef.h"
4456
4457 /* Mask to quickly check whether a C 'size_t' contains a
4458 non-ASCII, UTF8-encoded char. */
4459 #if (SIZEOF_SIZE_T == 8)
4460 # define ASCII_CHAR_MASK 0x8080808080808080ULL
4461 #elif (SIZEOF_SIZE_T == 4)
4462 # define ASCII_CHAR_MASK 0x80808080U
4463 #else
4464 # error C 'size_t' size should be either 4 or 8!
4465 #endif
4466
4467 static Py_ssize_t
4468 145479307 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4469 {
4470 145479307 const char *p = start;
4471
4472 #if SIZEOF_SIZE_T <= SIZEOF_VOID_P
4473 assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
4474
2/2
✓ Branch 0 taken 59188335 times.
✓ Branch 1 taken 86290972 times.
145479307 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
4475 /* Fast path, see in STRINGLIB(utf8_decode) for
4476 an explanation. */
4477 /* Help allocation */
4478 59188335 const char *_p = p;
4479 59188335 Py_UCS1 * q = dest;
4480
2/2
✓ Branch 0 taken 109804947 times.
✓ Branch 1 taken 59139148 times.
168944095 while (_p + SIZEOF_SIZE_T <= end) {
4481 109804947 size_t value = *(const size_t *) _p;
4482
2/2
✓ Branch 0 taken 49187 times.
✓ Branch 1 taken 109755760 times.
109804947 if (value & ASCII_CHAR_MASK)
4483 49187 break;
4484 109755760 *((size_t *)q) = value;
4485 109755760 _p += SIZEOF_SIZE_T;
4486 109755760 q += SIZEOF_SIZE_T;
4487 }
4488 59188335 p = _p;
4489
2/2
✓ Branch 0 taken 181471463 times.
✓ Branch 1 taken 58972955 times.
240444418 while (p < end) {
4490
2/2
✓ Branch 0 taken 215380 times.
✓ Branch 1 taken 181256083 times.
181471463 if ((unsigned char)*p & 0x80)
4491 215380 break;
4492 181256083 *q++ = *p++;
4493 }
4494 59188335 return p - start;
4495 }
4496 #endif
4497
2/2
✓ Branch 0 taken 650438637 times.
✓ Branch 1 taken 79289239 times.
729727876 while (p < end) {
4498 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4499 for an explanation. */
4500
2/2
✓ Branch 0 taken 77399815 times.
✓ Branch 1 taken 573038822 times.
650438637 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
4501 /* Help allocation */
4502 77399815 const char *_p = p;
4503
2/2
✓ Branch 0 taken 27905602 times.
✓ Branch 1 taken 77385975 times.
105291577 while (_p + SIZEOF_SIZE_T <= end) {
4504 27905602 size_t value = *(const size_t *) _p;
4505
2/2
✓ Branch 0 taken 13840 times.
✓ Branch 1 taken 27891762 times.
27905602 if (value & ASCII_CHAR_MASK)
4506 13840 break;
4507 27891762 _p += SIZEOF_SIZE_T;
4508 }
4509 77399815 p = _p;
4510
2/2
✓ Branch 0 taken 5448550 times.
✓ Branch 1 taken 71951265 times.
77399815 if (_p == end)
4511 5448550 break;
4512 }
4513
2/2
✓ Branch 0 taken 1553183 times.
✓ Branch 1 taken 643436904 times.
644990087 if ((unsigned char)*p & 0x80)
4514 1553183 break;
4515 643436904 ++p;
4516 }
4517 86290972 memcpy(dest, start, p - start);
4518 86290972 return p - start;
4519 }
4520
4521 static PyObject *
4522 152430630 unicode_decode_utf8(const char *s, Py_ssize_t size,
4523 _Py_error_handler error_handler, const char *errors,
4524 Py_ssize_t *consumed)
4525 {
4526
2/2
✓ Branch 0 taken 447205 times.
✓ Branch 1 taken 151983425 times.
152430630 if (size == 0) {
4527
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 447205 times.
447205 if (consumed)
4528 *consumed = 0;
4529 447205 _Py_RETURN_UNICODE_EMPTY();
4530 }
4531
4532 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4533
3/4
✓ Branch 0 taken 7564307 times.
✓ Branch 1 taken 144419118 times.
✓ Branch 2 taken 7564307 times.
✗ Branch 3 not taken.
151983425 if (size == 1 && (unsigned char)s[0] < 128) {
4534
2/2
✓ Branch 0 taken 3024 times.
✓ Branch 1 taken 7561283 times.
7564307 if (consumed) {
4535 3024 *consumed = 1;
4536 }
4537 7564307 return get_latin1_char((unsigned char)s[0]);
4538 }
4539
4540 144419118 const char *starts = s;
4541 144419118 const char *end = s + size;
4542
4543 // fast path: try ASCII string.
4544 144419118 PyObject *u = PyUnicode_New(size, 127);
4545
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 144419118 times.
144419118 if (u == NULL) {
4546 return NULL;
4547 }
4548 144419118 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
4549
2/2
✓ Branch 0 taken 142650556 times.
✓ Branch 1 taken 1768562 times.
144419118 if (s == end) {
4550 142650556 return u;
4551 }
4552
4553 // Use _PyUnicodeWriter after fast path is failed.
4554 _PyUnicodeWriter writer;
4555 1768562 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4556 1768562 writer.pos = s - starts;
4557
4558 Py_ssize_t startinpos, endinpos;
4559 1768562 const char *errmsg = "";
4560 1768562 PyObject *error_handler_obj = NULL;
4561 1768562 PyObject *exc = NULL;
4562
4563
2/2
✓ Branch 0 taken 2271758 times.
✓ Branch 1 taken 1276574 times.
3548332 while (s < end) {
4564 Py_UCS4 ch;
4565 2271758 int kind = writer.kind;
4566
4567
2/2
✓ Branch 0 taken 1785501 times.
✓ Branch 1 taken 486257 times.
2271758 if (kind == PyUnicode_1BYTE_KIND) {
4568
2/2
✓ Branch 1 taken 1768562 times.
✓ Branch 2 taken 16939 times.
1785501 if (PyUnicode_IS_ASCII(writer.buffer))
4569 1768562 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4570 else
4571 16939 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4572
2/2
✓ Branch 0 taken 133874 times.
✓ Branch 1 taken 352383 times.
486257 } else if (kind == PyUnicode_2BYTE_KIND) {
4573 133874 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4574 } else {
4575 assert(kind == PyUnicode_4BYTE_KIND);
4576 352383 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4577 }
4578
4579
2/5
✓ Branch 0 taken 491988 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 1779770 times.
2271758 switch (ch) {
4580 491988 case 0:
4581
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 491988 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
491988 if (s == end || consumed)
4582 491988 goto End;
4583 errmsg = "unexpected end of data";
4584 startinpos = s - starts;
4585 endinpos = end - starts;
4586 break;
4587 case 1:
4588 errmsg = "invalid start byte";
4589 startinpos = s - starts;
4590 endinpos = startinpos + 1;
4591 break;
4592 case 2:
4593 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
4594 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
4595 {
4596 /* Truncated surrogate code in range D800-DFFF */
4597 goto End;
4598 }
4599 /* fall through */
4600 case 3:
4601 case 4:
4602 errmsg = "invalid continuation byte";
4603 startinpos = s - starts;
4604 endinpos = startinpos + ch - 1;
4605 break;
4606 1779770 default:
4607
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 1779770 times.
1779770 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4608 goto onError;
4609 1779770 continue;
4610 }
4611
4612 if (error_handler == _Py_ERROR_UNKNOWN)
4613 error_handler = _Py_GetErrorHandler(errors);
4614
4615 switch (error_handler) {
4616 case _Py_ERROR_IGNORE:
4617 s += (endinpos - startinpos);
4618 break;
4619
4620 case _Py_ERROR_REPLACE:
4621 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4622 goto onError;
4623 s += (endinpos - startinpos);
4624 break;
4625
4626 case _Py_ERROR_SURROGATEESCAPE:
4627 {
4628 Py_ssize_t i;
4629
4630 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4631 goto onError;
4632 for (i=startinpos; i<endinpos; i++) {
4633 ch = (Py_UCS4)(unsigned char)(starts[i]);
4634 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4635 ch + 0xdc00);
4636 writer.pos++;
4637 }
4638 s += (endinpos - startinpos);
4639 break;
4640 }
4641
4642 default:
4643 if (unicode_decode_call_errorhandler_writer(
4644 errors, &error_handler_obj,
4645 "utf-8", errmsg,
4646 &starts, &end, &startinpos, &endinpos, &exc, &s,
4647 &writer))
4648 goto onError;
4649 }
4650 }
4651
4652 1276574 End:
4653
2/2
✓ Branch 0 taken 404 times.
✓ Branch 1 taken 1768158 times.
1768562 if (consumed)
4654 404 *consumed = s - starts;
4655
4656 1768562 Py_XDECREF(error_handler_obj);
4657 1768562 Py_XDECREF(exc);
4658 1768562 return _PyUnicodeWriter_Finish(&writer);
4659
4660 onError:
4661 Py_XDECREF(error_handler_obj);
4662 Py_XDECREF(exc);
4663 _PyUnicodeWriter_Dealloc(&writer);
4664 return NULL;
4665 }
4666
4667
4668 PyObject *
4669 149578443 PyUnicode_DecodeUTF8Stateful(const char *s,
4670 Py_ssize_t size,
4671 const char *errors,
4672 Py_ssize_t *consumed)
4673 {
4674 149578443 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
4675 }
4676
4677
4678 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4679 non-zero, use strict error handler otherwise.
4680
4681 On success, write a pointer to a newly allocated wide character string into
4682 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4683 (in number of wchar_t units) into *wlen (if wlen is set).
4684
4685 On memory allocation failure, return -1.
4686
4687 On decoding error (if surrogateescape is zero), return -2. If wlen is
4688 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4689 is not NULL, write the decoding error message into *reason. */
4690 int
4691 86730 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
4692 const char **reason, _Py_error_handler errors)
4693 {
4694 86730 const char *orig_s = s;
4695 const char *e;
4696 wchar_t *unicode;
4697 Py_ssize_t outpos;
4698
4699 86730 int surrogateescape = 0;
4700 86730 int surrogatepass = 0;
4701
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 86730 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
86730 switch (errors)
4702 {
4703 case _Py_ERROR_STRICT:
4704 break;
4705 86730 case _Py_ERROR_SURROGATEESCAPE:
4706 86730 surrogateescape = 1;
4707 86730 break;
4708 case _Py_ERROR_SURROGATEPASS:
4709 surrogatepass = 1;
4710 break;
4711 default:
4712 return -3;
4713 }
4714
4715 /* Note: size is always greater than or equal to the resulting wchar_t
4716 count */
4717
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 86730 times.
86730 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
4718 return -1;
4719 }
4720
4721 86730 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4722
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 86730 times.
86730 if (!unicode) {
4723 return -1;
4724 }
4725
4726 /* Unpack UTF-8 encoded data */
4727 86730 e = s + size;
4728 86730 outpos = 0;
4729
1/2
✓ Branch 0 taken 86730 times.
✗ Branch 1 not taken.
86730 while (s < e) {
4730 Py_UCS4 ch;
4731 #if SIZEOF_WCHAR_T == 4
4732 86730 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4733 #else
4734 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
4735 #endif
4736
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 86730 times.
86730 if (ch > 0xFF) {
4737 #if SIZEOF_WCHAR_T == 4
4738 Py_UNREACHABLE();
4739 #else
4740 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
4741 /* write a surrogate pair */
4742 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4743 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4744 #endif
4745 }
4746 else {
4747
2/4
✓ Branch 0 taken 86730 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 86730 times.
✗ Branch 3 not taken.
86730 if (!ch && s == e) {
4748 86730 break;
4749 }
4750
4751 if (surrogateescape) {
4752 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4753 }
4754 else {
4755 /* Is it a valid three-byte code? */
4756 if (surrogatepass
4757 && (e - s) >= 3
4758 && (s[0] & 0xf0) == 0xe0
4759 && (s[1] & 0xc0) == 0x80
4760 && (s[2] & 0xc0) == 0x80)
4761 {
4762 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4763 s += 3;
4764 unicode[outpos++] = ch;
4765 }
4766 else {
4767 PyMem_RawFree(unicode );
4768 if (reason != NULL) {
4769 switch (ch) {
4770 case 0:
4771 *reason = "unexpected end of data";
4772 break;
4773 case 1:
4774 *reason = "invalid start byte";
4775 break;
4776 /* 2, 3, 4 */
4777 default:
4778 *reason = "invalid continuation byte";
4779 break;
4780 }
4781 }
4782 if (wlen != NULL) {
4783 *wlen = s - orig_s;
4784 }
4785 return -2;
4786 }
4787 }
4788 }
4789 }
4790 86730 unicode[outpos] = L'\0';
4791
1/2
✓ Branch 0 taken 86730 times.
✗ Branch 1 not taken.
86730 if (wlen) {
4792 86730 *wlen = outpos;
4793 }
4794 86730 *wstr = unicode;
4795 86730 return 0;
4796 }
4797
4798
4799 wchar_t*
4800 6761 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
4801 size_t *wlen)
4802 {
4803 wchar_t *wstr;
4804 6761 int res = _Py_DecodeUTF8Ex(arg, arglen,
4805 &wstr, wlen,
4806 NULL, _Py_ERROR_SURROGATEESCAPE);
4807
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6761 times.
6761 if (res != 0) {
4808 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
4809 assert(res != -3);
4810 if (wlen) {
4811 *wlen = (size_t)res;
4812 }
4813 return NULL;
4814 }
4815 6761 return wstr;
4816 }
4817
4818
4819 /* UTF-8 encoder using the error handler selected by 'errors' (strict, surrogateescape or surrogatepass; return -3 for any other).
4820
4821 On success, return 0 and write the newly allocated character string into *str
4822 (free it with PyMem_RawFree() if raw_malloc is non-zero, PyMem_Free() otherwise).
4823
4824 On encoding failure, return -2 and write the position of the invalid
4825 surrogate character into *error_pos (if error_pos is set) and the encoding
4826 error message into *reason (if reason is set).
4827
4828 On memory allocation failure, return -1. */
4829 int
4830 21071 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
4831 const char **reason, int raw_malloc, _Py_error_handler errors)
4832 {
4833 21071 const Py_ssize_t max_char_size = 4;
4834 21071 Py_ssize_t len = wcslen(text);
4835
4836 assert(len >= 0);
4837
4838 21071 int surrogateescape = 0;
4839 21071 int surrogatepass = 0;
4840
2/4
✓ Branch 0 taken 13616 times.
✓ Branch 1 taken 7455 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
21071 switch (errors)
4841 {
4842 13616 case _Py_ERROR_STRICT:
4843 13616 break;
4844 7455 case _Py_ERROR_SURROGATEESCAPE:
4845 7455 surrogateescape = 1;
4846 7455 break;
4847 case _Py_ERROR_SURROGATEPASS:
4848 surrogatepass = 1;
4849 break;
4850 default:
4851 return -3;
4852 }
4853
4854
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 21071 times.
21071 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
4855 return -1;
4856 }
4857 char *bytes;
4858
1/2
✓ Branch 0 taken 21071 times.
✗ Branch 1 not taken.
21071 if (raw_malloc) {
4859 21071 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
4860 }
4861 else {
4862 bytes = PyMem_Malloc((len + 1) * max_char_size);
4863 }
4864
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 21071 times.
21071 if (bytes == NULL) {
4865 return -1;
4866 }
4867
4868 21071 char *p = bytes;
4869 Py_ssize_t i;
4870
2/2
✓ Branch 0 taken 566648 times.
✓ Branch 1 taken 21071 times.
587719 for (i = 0; i < len; ) {
4871 566648 Py_ssize_t ch_pos = i;
4872 566648 Py_UCS4 ch = text[i];
4873 566648 i++;
4874 #if Py_UNICODE_SIZE == 2
4875 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
4876 && i < len
4877 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
4878 {
4879 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
4880 i++;
4881 }
4882 #endif
4883
4884
1/2
✓ Branch 0 taken 566648 times.
✗ Branch 1 not taken.
566648 if (ch < 0x80) {
4885 /* Encode ASCII */
4886 566648 *p++ = (char) ch;
4887
4888 }
4889 else if (ch < 0x0800) {
4890 /* Encode as a two-byte sequence (U+0080..U+07FF) */
4891 *p++ = (char)(0xc0 | (ch >> 6));
4892 *p++ = (char)(0x80 | (ch & 0x3f));
4893 }
4894 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
4895 /* surrogateescape error handler */
4896 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
4897 if (error_pos != NULL) {
4898 *error_pos = (size_t)ch_pos;
4899 }
4900 if (reason != NULL) {
4901 *reason = "encoding error";
4902 }
4903 if (raw_malloc) {
4904 PyMem_RawFree(bytes);
4905 }
4906 else {
4907 PyMem_Free(bytes);
4908 }
4909 return -2;
4910 }
4911 *p++ = (char)(ch & 0xff);
4912 }
4913 else if (ch < 0x10000) {
4914 *p++ = (char)(0xe0 | (ch >> 12));
4915 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4916 *p++ = (char)(0x80 | (ch & 0x3f));
4917 }
4918 else { /* ch >= 0x10000 */
4919 assert(ch <= MAX_UNICODE);
4920 /* Encode UCS4 Unicode ordinals */
4921 *p++ = (char)(0xf0 | (ch >> 18));
4922 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4923 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4924 *p++ = (char)(0x80 | (ch & 0x3f));
4925 }
4926 }
4927 21071 *p++ = '\0';
4928
4929 21071 size_t final_size = (p - bytes);
4930 char *bytes2;
4931
1/2
✓ Branch 0 taken 21071 times.
✗ Branch 1 not taken.
21071 if (raw_malloc) {
4932 21071 bytes2 = PyMem_RawRealloc(bytes, final_size);
4933 }
4934 else {
4935 bytes2 = PyMem_Realloc(bytes, final_size);
4936 }
4937
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 21071 times.
21071 if (bytes2 == NULL) {
4938 if (error_pos != NULL) {
4939 *error_pos = (size_t)-1;
4940 }
4941 if (raw_malloc) {
4942 PyMem_RawFree(bytes);
4943 }
4944 else {
4945 PyMem_Free(bytes);
4946 }
4947 return -1;
4948 }
4949 21071 *str = bytes2;
4950 21071 return 0;
4951 }
4952
4953
4954 /* Primary internal function which creates utf8 encoded bytes objects.
4955
4956 Allocation strategy: if the string is short, convert into a stack buffer
4957 and allocate exactly as much space as needed at the end. Else allocate the
4958 maximum possible needed (4 result bytes per Unicode character), and return
4959 the excess memory at the end.
4960 */
4961 static PyObject *
4962 4326675 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
4963 const char *errors)
4964 {
4965
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 4326675 times.
4326675 if (!PyUnicode_Check(unicode)) {
4966 PyErr_BadArgument();
4967 return NULL;
4968 }
4969
4970
4/4
✓ Branch 1 taken 4305494 times.
✓ Branch 2 taken 21181 times.
✓ Branch 3 taken 4305494 times.
✓ Branch 4 taken 21181 times.
4326675 if (PyUnicode_UTF8(unicode))
4971
2/4
✓ Branch 0 taken 4305494 times.
✗ Branch 1 not taken.
✓ Branch 3 taken 4305494 times.
✗ Branch 4 not taken.
4305494 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4972 4305494 PyUnicode_UTF8_LENGTH(unicode));
4973
4974 21181 int kind = PyUnicode_KIND(unicode);
4975 21181 const void *data = PyUnicode_DATA(unicode);
4976 21181 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
4977
4978 _PyBytesWriter writer;
4979 char *end;
4980
4981
3/4
✗ Branch 0 not taken.
✓ Branch 1 taken 887 times.
✓ Branch 2 taken 16686 times.
✓ Branch 3 taken 3608 times.
21181 switch (kind) {
4982 default:
4983 Py_UNREACHABLE();
4984 887 case PyUnicode_1BYTE_KIND:
4985 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4986 assert(!PyUnicode_IS_ASCII(unicode));
4987 887 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
4988 887 break;
4989 16686 case PyUnicode_2BYTE_KIND:
4990 16686 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
4991 16686 break;
4992 3608 case PyUnicode_4BYTE_KIND:
4993 3608 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
4994 3608 break;
4995 }
4996
4997
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 21180 times.
21181 if (end == NULL) {
4998 1 _PyBytesWriter_Dealloc(&writer);
4999 1 return NULL;
5000 }
5001 21180 return _PyBytesWriter_Finish(&writer, end);
5002 }
5003
5004 static int
5005 1086 unicode_fill_utf8(PyObject *unicode)
5006 {
5007 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5008 assert(!PyUnicode_IS_ASCII(unicode));
5009
5010 1086 int kind = PyUnicode_KIND(unicode);
5011 1086 const void *data = PyUnicode_DATA(unicode);
5012 1086 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5013
5014 _PyBytesWriter writer;
5015 char *end;
5016
5017
2/4
✗ Branch 0 not taken.
✓ Branch 1 taken 259 times.
✓ Branch 2 taken 827 times.
✗ Branch 3 not taken.
1086 switch (kind) {
5018 default:
5019 Py_UNREACHABLE();
5020 259 case PyUnicode_1BYTE_KIND:
5021 259 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5022 _Py_ERROR_STRICT, NULL);
5023 259 break;
5024 827 case PyUnicode_2BYTE_KIND:
5025 827 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5026 _Py_ERROR_STRICT, NULL);
5027 827 break;
5028 case PyUnicode_4BYTE_KIND:
5029 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5030 _Py_ERROR_STRICT, NULL);
5031 break;
5032 }
5033
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1086 times.
1086 if (end == NULL) {
5034 _PyBytesWriter_Dealloc(&writer);
5035 return -1;
5036 }
5037
5038
2/2
✓ Branch 0 taken 22 times.
✓ Branch 1 taken 1064 times.
1086 const char *start = writer.use_small_buffer ? writer.small_buffer :
5039 1064 PyBytes_AS_STRING(writer.buffer);
5040 1086 Py_ssize_t len = end - start;
5041
5042 1086 char *cache = PyObject_Malloc(len + 1);
5043
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1086 times.
1086 if (cache == NULL) {
5044 _PyBytesWriter_Dealloc(&writer);
5045 PyErr_NoMemory();
5046 return -1;
5047 }
5048 1086 _PyUnicode_UTF8(unicode) = cache;
5049 1086 _PyUnicode_UTF8_LENGTH(unicode) = len;
5050 1086 memcpy(cache, start, len);
5051 1086 cache[len] = '\0';
5052 1086 _PyBytesWriter_Dealloc(&writer);
5053 1086 return 0;
5054 }
5055
5056 PyObject *
5057 941691 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5058 {
5059 941691 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5060 }
5061
5062
5063 PyObject *
5064 944 PyUnicode_AsUTF8String(PyObject *unicode)
5065 {
5066 944 return _PyUnicode_AsUTF8String(unicode, NULL);
5067 }
5068
5069 /* --- UTF-32 Codec ------------------------------------------------------- */
5070
5071 PyObject *
5072 PyUnicode_DecodeUTF32(const char *s,
5073 Py_ssize_t size,
5074 const char *errors,
5075 int *byteorder)
5076 {
5077 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5078 }
5079
5080 PyObject *
5081 PyUnicode_DecodeUTF32Stateful(const char *s,
5082 Py_ssize_t size,
5083 const char *errors,
5084 int *byteorder,
5085 Py_ssize_t *consumed)
5086 {
5087 const char *starts = s;
5088 Py_ssize_t startinpos;
5089 Py_ssize_t endinpos;
5090 _PyUnicodeWriter writer;
5091 const unsigned char *q, *e;
5092 int le, bo = 0; /* assume native ordering by default */
5093 const char *encoding;
5094 const char *errmsg = "";
5095 PyObject *errorHandler = NULL;
5096 PyObject *exc = NULL;
5097
5098 q = (const unsigned char *)s;
5099 e = q + size;
5100
5101 if (byteorder)
5102 bo = *byteorder;
5103
5104 /* Check for BOM marks (U+FEFF) in the input and adjust current
5105 byte order setting accordingly. In native mode, the leading BOM
5106 mark is skipped, in all other modes, it is copied to the output
5107 stream as-is (giving a ZWNBSP character). */
5108 if (bo == 0 && size >= 4) {
5109 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5110 if (bom == 0x0000FEFF) {
5111 bo = -1;
5112 q += 4;
5113 }
5114 else if (bom == 0xFFFE0000) {
5115 bo = 1;
5116 q += 4;
5117 }
5118 if (byteorder)
5119 *byteorder = bo;
5120 }
5121
5122 if (q == e) {
5123 if (consumed)
5124 *consumed = size;
5125 _Py_RETURN_UNICODE_EMPTY();
5126 }
5127
5128 #ifdef WORDS_BIGENDIAN
5129 le = bo < 0;
5130 #else
5131 le = bo <= 0;
5132 #endif
5133 encoding = le ? "utf-32-le" : "utf-32-be";
5134
5135 _PyUnicodeWriter_Init(&writer);
5136 writer.min_length = (e - q + 3) / 4;
5137 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5138 goto onError;
5139
5140 while (1) {
5141 Py_UCS4 ch = 0;
5142 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5143
5144 if (e - q >= 4) {
5145 int kind = writer.kind;
5146 void *data = writer.data;
5147 const unsigned char *last = e - 4;
5148 Py_ssize_t pos = writer.pos;
5149 if (le) {
5150 do {
5151 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5152 if (ch > maxch)
5153 break;
5154 if (kind != PyUnicode_1BYTE_KIND &&
5155 Py_UNICODE_IS_SURROGATE(ch))
5156 break;
5157 PyUnicode_WRITE(kind, data, pos++, ch);
5158 q += 4;
5159 } while (q <= last);
5160 }
5161 else {
5162 do {
5163 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5164 if (ch > maxch)
5165 break;
5166 if (kind != PyUnicode_1BYTE_KIND &&
5167 Py_UNICODE_IS_SURROGATE(ch))
5168 break;
5169 PyUnicode_WRITE(kind, data, pos++, ch);
5170 q += 4;
5171 } while (q <= last);
5172 }
5173 writer.pos = pos;
5174 }
5175
5176 if (Py_UNICODE_IS_SURROGATE(ch)) {
5177 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5178 startinpos = ((const char *)q) - starts;
5179 endinpos = startinpos + 4;
5180 }
5181 else if (ch <= maxch) {
5182 if (q == e || consumed)
5183 break;
5184 /* remaining bytes at the end? (size should be divisible by 4) */
5185 errmsg = "truncated data";
5186 startinpos = ((const char *)q) - starts;
5187 endinpos = ((const char *)e) - starts;
5188 }
5189 else {
5190 if (ch < 0x110000) {
5191 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5192 goto onError;
5193 q += 4;
5194 continue;
5195 }
5196 errmsg = "code point not in range(0x110000)";
5197 startinpos = ((const char *)q) - starts;
5198 endinpos = startinpos + 4;
5199 }
5200
5201 /* The remaining input chars are ignored if the callback
5202 chooses to skip the input */
5203 if (unicode_decode_call_errorhandler_writer(
5204 errors, &errorHandler,
5205 encoding, errmsg,
5206 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5207 &writer))
5208 goto onError;
5209 }
5210
5211 if (consumed)
5212 *consumed = (const char *)q-starts;
5213
5214 Py_XDECREF(errorHandler);
5215 Py_XDECREF(exc);
5216 return _PyUnicodeWriter_Finish(&writer);
5217
5218 onError:
5219 _PyUnicodeWriter_Dealloc(&writer);
5220 Py_XDECREF(errorHandler);
5221 Py_XDECREF(exc);
5222 return NULL;
5223 }
5224
5225 PyObject *
5226 132 _PyUnicode_EncodeUTF32(PyObject *str,
5227 const char *errors,
5228 int byteorder)
5229 {
5230 int kind;
5231 const void *data;
5232 Py_ssize_t len;
5233 PyObject *v;
5234 uint32_t *out;
5235 #if PY_LITTLE_ENDIAN
5236 132 int native_ordering = byteorder <= 0;
5237 #else
5238 int native_ordering = byteorder >= 0;
5239 #endif
5240 const char *encoding;
5241 Py_ssize_t nsize, pos;
5242 132 PyObject *errorHandler = NULL;
5243 132 PyObject *exc = NULL;
5244 132 PyObject *rep = NULL;
5245
5246
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 132 times.
132 if (!PyUnicode_Check(str)) {
5247 PyErr_BadArgument();
5248 return NULL;
5249 }
5250 132 kind = PyUnicode_KIND(str);
5251 132 data = PyUnicode_DATA(str);
5252 132 len = PyUnicode_GET_LENGTH(str);
5253
5254
3/4
✓ Branch 0 taken 44 times.
✓ Branch 1 taken 88 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 132 times.
132 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5255 return PyErr_NoMemory();
5256 132 nsize = len + (byteorder == 0);
5257 132 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5258
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 132 times.
132 if (v == NULL)
5259 return NULL;
5260
5261 /* output buffer is 4-bytes aligned */
5262 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5263 132 out = (uint32_t *)PyBytes_AS_STRING(v);
5264
2/2
✓ Branch 0 taken 44 times.
✓ Branch 1 taken 88 times.
132 if (byteorder == 0)
5265 44 *out++ = 0xFEFF;
5266
2/2
✓ Branch 0 taken 66 times.
✓ Branch 1 taken 66 times.
132 if (len == 0)
5267 66 goto done;
5268
5269
2/2
✓ Branch 0 taken 22 times.
✓ Branch 1 taken 44 times.
66 if (byteorder == -1)
5270 22 encoding = "utf-32-le";
5271
2/2
✓ Branch 0 taken 22 times.
✓ Branch 1 taken 22 times.
44 else if (byteorder == 1)
5272 22 encoding = "utf-32-be";
5273 else
5274 22 encoding = "utf-32";
5275
5276
1/2
✓ Branch 0 taken 66 times.
✗ Branch 1 not taken.
66 if (kind == PyUnicode_1BYTE_KIND) {
5277 66 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5278 66 goto done;
5279 }
5280
5281 pos = 0;
5282 while (pos < len) {
5283 Py_ssize_t newpos, repsize, moreunits;
5284
5285 if (kind == PyUnicode_2BYTE_KIND) {
5286 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5287 &out, native_ordering);
5288 }
5289 else {
5290 assert(kind == PyUnicode_4BYTE_KIND);
5291 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5292 &out, native_ordering);
5293 }
5294 if (pos == len)
5295 break;
5296
5297 rep = unicode_encode_call_errorhandler(
5298 errors, &errorHandler,
5299 encoding, "surrogates not allowed",
5300 str, &exc, pos, pos + 1, &newpos);
5301 if (!rep)
5302 goto error;
5303
5304 if (PyBytes_Check(rep)) {
5305 repsize = PyBytes_GET_SIZE(rep);
5306 if (repsize & 3) {
5307 raise_encode_exception(&exc, encoding,
5308 str, pos, pos + 1,
5309 "surrogates not allowed");
5310 goto error;
5311 }
5312 moreunits = repsize / 4;
5313 }
5314 else {
5315 assert(PyUnicode_Check(rep));
5316 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5317 if (!PyUnicode_IS_ASCII(rep)) {
5318 raise_encode_exception(&exc, encoding,
5319 str, pos, pos + 1,
5320 "surrogates not allowed");
5321 goto error;
5322 }
5323 }
5324 moreunits += pos - newpos;
5325 pos = newpos;
5326
5327 /* four bytes are reserved for each surrogate */
5328 if (moreunits > 0) {
5329 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5330 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5331 /* integer overflow */
5332 PyErr_NoMemory();
5333 goto error;
5334 }
5335 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
5336 goto error;
5337 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5338 }
5339
5340 if (PyBytes_Check(rep)) {
5341 memcpy(out, PyBytes_AS_STRING(rep), repsize);
5342 out += repsize / 4;
5343 } else /* rep is unicode */ {
5344 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5345 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5346 &out, native_ordering);
5347 }
5348
5349 Py_CLEAR(rep);
5350 }
5351
5352 /* Cut back to size actually needed. This is necessary, for example,
5353 when encoding a string containing isolated surrogates with the 'ignore'
5354 handler. */
5355 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5356 if (nsize != PyBytes_GET_SIZE(v))
5357 _PyBytes_Resize(&v, nsize);
5358 Py_XDECREF(errorHandler);
5359 Py_XDECREF(exc);
5360 132 done:
5361 132 return v;
5362 error:
5363 Py_XDECREF(rep);
5364 Py_XDECREF(errorHandler);
5365 Py_XDECREF(exc);
5366 Py_XDECREF(v);
5367 return NULL;
5368 }
5369
5370 PyObject *
5371 PyUnicode_AsUTF32String(PyObject *unicode)
5372 {
5373 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5374 }
5375
5376 /* --- UTF-16 Codec ------------------------------------------------------- */
5377
5378 PyObject *
5379 PyUnicode_DecodeUTF16(const char *s,
5380 Py_ssize_t size,
5381 const char *errors,
5382 int *byteorder)
5383 {
5384 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5385 }
5386
5387 PyObject *
5388 PyUnicode_DecodeUTF16Stateful(const char *s,
5389 Py_ssize_t size,
5390 const char *errors,
5391 int *byteorder,
5392 Py_ssize_t *consumed)
5393 {
5394 const char *starts = s;
5395 Py_ssize_t startinpos;
5396 Py_ssize_t endinpos;
5397 _PyUnicodeWriter writer;
5398 const unsigned char *q, *e;
5399 int bo = 0; /* assume native ordering by default */
5400 int native_ordering;
5401 const char *errmsg = "";
5402 PyObject *errorHandler = NULL;
5403 PyObject *exc = NULL;
5404 const char *encoding;
5405
5406 q = (const unsigned char *)s;
5407 e = q + size;
5408
5409 if (byteorder)
5410 bo = *byteorder;
5411
5412 /* Check for BOM marks (U+FEFF) in the input and adjust current
5413 byte order setting accordingly. In native mode, the leading BOM
5414 mark is skipped, in all other modes, it is copied to the output
5415 stream as-is (giving a ZWNBSP character). */
5416 if (bo == 0 && size >= 2) {
5417 const Py_UCS4 bom = (q[1] << 8) | q[0];
5418 if (bom == 0xFEFF) {
5419 q += 2;
5420 bo = -1;
5421 }
5422 else if (bom == 0xFFFE) {
5423 q += 2;
5424 bo = 1;
5425 }
5426 if (byteorder)
5427 *byteorder = bo;
5428 }
5429
5430 if (q == e) {
5431 if (consumed)
5432 *consumed = size;
5433 _Py_RETURN_UNICODE_EMPTY();
5434 }
5435
5436 #if PY_LITTLE_ENDIAN
5437 native_ordering = bo <= 0;
5438 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5439 #else
5440 native_ordering = bo >= 0;
5441 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5442 #endif
5443
5444 /* Note: size will always be longer than the resulting Unicode
5445 character count normally. Error handler will take care of
5446 resizing when needed. */
5447 _PyUnicodeWriter_Init(&writer);
5448 writer.min_length = (e - q + 1) / 2;
5449 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5450 goto onError;
5451
5452 while (1) {
5453 Py_UCS4 ch = 0;
5454 if (e - q >= 2) {
5455 int kind = writer.kind;
5456 if (kind == PyUnicode_1BYTE_KIND) {
5457 if (PyUnicode_IS_ASCII(writer.buffer))
5458 ch = asciilib_utf16_decode(&q, e,
5459 (Py_UCS1*)writer.data, &writer.pos,
5460 native_ordering);
5461 else
5462 ch = ucs1lib_utf16_decode(&q, e,
5463 (Py_UCS1*)writer.data, &writer.pos,
5464 native_ordering);
5465 } else if (kind == PyUnicode_2BYTE_KIND) {
5466 ch = ucs2lib_utf16_decode(&q, e,
5467 (Py_UCS2*)writer.data, &writer.pos,
5468 native_ordering);
5469 } else {
5470 assert(kind == PyUnicode_4BYTE_KIND);
5471 ch = ucs4lib_utf16_decode(&q, e,
5472 (Py_UCS4*)writer.data, &writer.pos,
5473 native_ordering);
5474 }
5475 }
5476
5477 switch (ch)
5478 {
5479 case 0:
5480 /* remaining byte at the end? (size should be even) */
5481 if (q == e || consumed)
5482 goto End;
5483 errmsg = "truncated data";
5484 startinpos = ((const char *)q) - starts;
5485 endinpos = ((const char *)e) - starts;
5486 break;
5487 /* The remaining input chars are ignored if the callback
5488 chooses to skip the input */
5489 case 1:
5490 q -= 2;
5491 if (consumed)
5492 goto End;
5493 errmsg = "unexpected end of data";
5494 startinpos = ((const char *)q) - starts;
5495 endinpos = ((const char *)e) - starts;
5496 break;
5497 case 2:
5498 errmsg = "illegal encoding";
5499 startinpos = ((const char *)q) - 2 - starts;
5500 endinpos = startinpos + 2;
5501 break;
5502 case 3:
5503 errmsg = "illegal UTF-16 surrogate";
5504 startinpos = ((const char *)q) - 4 - starts;
5505 endinpos = startinpos + 2;
5506 break;
5507 default:
5508 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5509 goto onError;
5510 continue;
5511 }
5512
5513 if (unicode_decode_call_errorhandler_writer(
5514 errors,
5515 &errorHandler,
5516 encoding, errmsg,
5517 &starts,
5518 (const char **)&e,
5519 &startinpos,
5520 &endinpos,
5521 &exc,
5522 (const char **)&q,
5523 &writer))
5524 goto onError;
5525 }
5526
5527 End:
5528 if (consumed)
5529 *consumed = (const char *)q-starts;
5530
5531 Py_XDECREF(errorHandler);
5532 Py_XDECREF(exc);
5533 return _PyUnicodeWriter_Finish(&writer);
5534
5535 onError:
5536 _PyUnicodeWriter_Dealloc(&writer);
5537 Py_XDECREF(errorHandler);
5538 Py_XDECREF(exc);
5539 return NULL;
5540 }
5541
5542 PyObject *
5543 132 _PyUnicode_EncodeUTF16(PyObject *str,
5544 const char *errors,
5545 int byteorder)
5546 {
5547 int kind;
5548 const void *data;
5549 Py_ssize_t len;
5550 PyObject *v;
5551 unsigned short *out;
5552 Py_ssize_t pairs;
5553 #if PY_BIG_ENDIAN
5554 int native_ordering = byteorder >= 0;
5555 #else
5556 132 int native_ordering = byteorder <= 0;
5557 #endif
5558 const char *encoding;
5559 Py_ssize_t nsize, pos;
5560 132 PyObject *errorHandler = NULL;
5561 132 PyObject *exc = NULL;
5562 132 PyObject *rep = NULL;
5563
5564
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 132 times.
132 if (!PyUnicode_Check(str)) {
5565 PyErr_BadArgument();
5566 return NULL;
5567 }
5568 132 kind = PyUnicode_KIND(str);
5569 132 data = PyUnicode_DATA(str);
5570 132 len = PyUnicode_GET_LENGTH(str);
5571
5572 132 pairs = 0;
5573
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 132 times.
132 if (kind == PyUnicode_4BYTE_KIND) {
5574 const Py_UCS4 *in = (const Py_UCS4 *)data;
5575 const Py_UCS4 *end = in + len;
5576 while (in < end) {
5577 if (*in++ >= 0x10000) {
5578 pairs++;
5579 }
5580 }
5581 }
5582
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 132 times.
132 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5583 return PyErr_NoMemory();
5584 }
5585 132 nsize = len + pairs + (byteorder == 0);
5586 132 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5587
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 132 times.
132 if (v == NULL) {
5588 return NULL;
5589 }
5590
5591 /* output buffer is 2-bytes aligned */
5592 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5593 132 out = (unsigned short *)PyBytes_AS_STRING(v);
5594
2/2
✓ Branch 0 taken 44 times.
✓ Branch 1 taken 88 times.
132 if (byteorder == 0) {
5595 44 *out++ = 0xFEFF;
5596 }
5597
2/2
✓ Branch 0 taken 66 times.
✓ Branch 1 taken 66 times.
132 if (len == 0) {
5598 66 goto done;
5599 }
5600
5601
1/2
✓ Branch 0 taken 66 times.
✗ Branch 1 not taken.
66 if (kind == PyUnicode_1BYTE_KIND) {
5602 66 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5603 66 goto done;
5604 }
5605
5606 if (byteorder < 0) {
5607 encoding = "utf-16-le";
5608 }
5609 else if (byteorder > 0) {
5610 encoding = "utf-16-be";
5611 }
5612 else {
5613 encoding = "utf-16";
5614 }
5615
5616 pos = 0;
5617 while (pos < len) {
5618 Py_ssize_t newpos, repsize, moreunits;
5619
5620 if (kind == PyUnicode_2BYTE_KIND) {
5621 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5622 &out, native_ordering);
5623 }
5624 else {
5625 assert(kind == PyUnicode_4BYTE_KIND);
5626 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5627 &out, native_ordering);
5628 }
5629 if (pos == len)
5630 break;
5631
5632 rep = unicode_encode_call_errorhandler(
5633 errors, &errorHandler,
5634 encoding, "surrogates not allowed",
5635 str, &exc, pos, pos + 1, &newpos);
5636 if (!rep)
5637 goto error;
5638
5639 if (PyBytes_Check(rep)) {
5640 repsize = PyBytes_GET_SIZE(rep);
5641 if (repsize & 1) {
5642 raise_encode_exception(&exc, encoding,
5643 str, pos, pos + 1,
5644 "surrogates not allowed");
5645 goto error;
5646 }
5647 moreunits = repsize / 2;
5648 }
5649 else {
5650 assert(PyUnicode_Check(rep));
5651 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5652 if (!PyUnicode_IS_ASCII(rep)) {
5653 raise_encode_exception(&exc, encoding,
5654 str, pos, pos + 1,
5655 "surrogates not allowed");
5656 goto error;
5657 }
5658 }
5659 moreunits += pos - newpos;
5660 pos = newpos;
5661
5662 /* two bytes are reserved for each surrogate */
5663 if (moreunits > 0) {
5664 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5665 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
5666 /* integer overflow */
5667 PyErr_NoMemory();
5668 goto error;
5669 }
5670 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
5671 goto error;
5672 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5673 }
5674
5675 if (PyBytes_Check(rep)) {
5676 memcpy(out, PyBytes_AS_STRING(rep), repsize);
5677 out += repsize / 2;
5678 } else /* rep is unicode */ {
5679 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5680 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5681 &out, native_ordering);
5682 }
5683
5684 Py_CLEAR(rep);
5685 }
5686
5687 /* Cut back to size actually needed. This is necessary, for example,
5688 when encoding a string containing isolated surrogates with the 'ignore'
5689 handler. */
5690 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5691 if (nsize != PyBytes_GET_SIZE(v))
5692 _PyBytes_Resize(&v, nsize);
5693 Py_XDECREF(errorHandler);
5694 Py_XDECREF(exc);
5695 132 done:
5696 132 return v;
5697 error:
5698 Py_XDECREF(rep);
5699 Py_XDECREF(errorHandler);
5700 Py_XDECREF(exc);
5701 Py_XDECREF(v);
5702 return NULL;
5703 #undef STORECHAR
5704 }
5705
5706 PyObject *
5707 PyUnicode_AsUTF16String(PyObject *unicode)
5708 {
5709 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5710 }
5711
5712 /* --- Unicode Escape Codec ----------------------------------------------- */
5713
5714 static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
5715
5716 PyObject *
5717 57648 _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
5718 Py_ssize_t size,
5719 const char *errors,
5720 Py_ssize_t *consumed,
5721 const char **first_invalid_escape)
5722 {
5723 57648 const char *starts = s;
5724 _PyUnicodeWriter writer;
5725 const char *end;
5726 57648 PyObject *errorHandler = NULL;
5727 57648 PyObject *exc = NULL;
5728
5729 // so we can remember if we've seen an invalid escape char or not
5730 57648 *first_invalid_escape = NULL;
5731
5732
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 57648 times.
57648 if (size == 0) {
5733 if (consumed) {
5734 *consumed = 0;
5735 }
5736 _Py_RETURN_UNICODE_EMPTY();
5737 }
5738 /* Escaped strings will always be longer than the resulting
5739 Unicode string, so we start with size here and then reduce the
5740 length after conversion to the true value.
5741 (but if the error callback returns a long replacement string
5742 we'll have to allocate more space) */
5743 57648 _PyUnicodeWriter_Init(&writer);
5744 57648 writer.min_length = size;
5745
3/8
✗ Branch 0 not taken.
✓ Branch 1 taken 57648 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 57648 times.
✗ Branch 5 not taken.
✗ Branch 7 not taken.
✓ Branch 8 taken 57648 times.
57648 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5746 goto onError;
5747 }
5748
5749 57648 end = s + size;
5750
2/2
✓ Branch 0 taken 923420 times.
✓ Branch 1 taken 57648 times.
981068 while (s < end) {
5751 923420 unsigned char c = (unsigned char) *s++;
5752 Py_UCS4 ch;
5753 int count;
5754 const char *message;
5755
5756 #define WRITE_ASCII_CHAR(ch) \
5757 do { \
5758 assert(ch <= 127); \
5759 assert(writer.pos < writer.size); \
5760 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5761 } while(0)
5762
5763 #define WRITE_CHAR(ch) \
5764 do { \
5765 if (ch <= writer.maxchar) { \
5766 assert(writer.pos < writer.size); \
5767 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5768 } \
5769 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5770 goto onError; \
5771 } \
5772 } while(0)
5773
5774 /* Non-escape characters are interpreted as Unicode ordinals */
5775
2/2
✓ Branch 0 taken 842166 times.
✓ Branch 1 taken 81254 times.
923420 if (c != '\\') {
5776
1/4
✓ Branch 0 taken 842166 times.
✗ Branch 1 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
842166 WRITE_CHAR(c);
5777 923420 continue;
5778 }
5779
5780 81254 Py_ssize_t startinpos = s - starts - 1;
5781 /* \ - Escapes */
5782
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 81254 times.
81254 if (s >= end) {
5783 message = "\\ at end of string";
5784 goto incomplete;
5785 }
5786 81254 c = (unsigned char) *s++;
5787
5788 assert(writer.pos < writer.size);
5789
17/17
✓ Branch 0 taken 4031 times.
✓ Branch 1 taken 4167 times.
✓ Branch 2 taken 305 times.
✓ Branch 3 taken 473 times.
✓ Branch 4 taken 163 times.
✓ Branch 5 taken 167 times.
✓ Branch 6 taken 846 times.
✓ Branch 7 taken 11338 times.
✓ Branch 8 taken 378 times.
✓ Branch 9 taken 4 times.
✓ Branch 10 taken 13 times.
✓ Branch 11 taken 11885 times.
✓ Branch 12 taken 2655 times.
✓ Branch 13 taken 25983 times.
✓ Branch 14 taken 18703 times.
✓ Branch 15 taken 142 times.
✓ Branch 16 taken 1 times.
81254 switch (c) {
5790
5791 /* \x escapes */
5792 4031 case '\n': continue;
5793 4167 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5794 305 case '\'': WRITE_ASCII_CHAR('\''); continue;
5795 473 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5796 163 case 'b': WRITE_ASCII_CHAR('\b'); continue;
5797 /* FF */
5798 167 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5799 846 case 't': WRITE_ASCII_CHAR('\t'); continue;
5800 11338 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5801 378 case 'r': WRITE_ASCII_CHAR('\r'); continue;
5802 /* VT */
5803 4 case 'v': WRITE_ASCII_CHAR('\013'); continue;
5804 /* BEL, not classic C */
5805 13 case 'a': WRITE_ASCII_CHAR('\007'); continue;
5806
5807 /* \OOO (octal) escapes */
5808 11885 case '0': case '1': case '2': case '3':
5809 case '4': case '5': case '6': case '7':
5810 11885 ch = c - '0';
5811
6/6
✓ Branch 0 taken 11516 times.
✓ Branch 1 taken 369 times.
✓ Branch 2 taken 11018 times.
✓ Branch 3 taken 498 times.
✓ Branch 4 taken 1621 times.
✓ Branch 5 taken 9397 times.
11885 if (s < end && '0' <= *s && *s <= '7') {
5812 1621 ch = (ch<<3) + *s++ - '0';
5813
6/6
✓ Branch 0 taken 1535 times.
✓ Branch 1 taken 86 times.
✓ Branch 2 taken 1516 times.
✓ Branch 3 taken 19 times.
✓ Branch 4 taken 67 times.
✓ Branch 5 taken 1449 times.
1621 if (s < end && '0' <= *s && *s <= '7') {
5814 67 ch = (ch<<3) + *s++ - '0';
5815 }
5816 }
5817
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 11885 times.
11885 if (ch > 0377) {
5818 if (*first_invalid_escape == NULL) {
5819 *first_invalid_escape = s-3; /* Back up 3 chars, since we've
5820 already incremented s. */
5821 }
5822 }
5823
3/4
✓ Branch 0 taken 11884 times.
✓ Branch 1 taken 1 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 1 times.
11885 WRITE_CHAR(ch);
5824 11885 continue;
5825
5826 /* hex escapes */
5827 /* \xXX */
5828 2655 case 'x':
5829 2655 count = 2;
5830 2655 message = "truncated \\xXX escape";
5831 2655 goto hexescape;
5832
5833 /* \uXXXX */
5834 25983 case 'u':
5835 25983 count = 4;
5836 25983 message = "truncated \\uXXXX escape";
5837 25983 goto hexescape;
5838
5839 /* \UXXXXXXXX */
5840 18703 case 'U':
5841 18703 count = 8;
5842 18703 message = "truncated \\UXXXXXXXX escape";
5843 47341 hexescape:
5844
2/2
✓ Branch 0 taken 258866 times.
✓ Branch 1 taken 47341 times.
306207 for (ch = 0; count; ++s, --count) {
5845
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 258866 times.
258866 if (s >= end) {
5846 goto incomplete;
5847 }
5848 258866 c = (unsigned char)*s;
5849 258866 ch <<= 4;
5850
3/4
✓ Branch 0 taken 258866 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 204258 times.
✓ Branch 3 taken 54608 times.
258866 if (c >= '0' && c <= '9') {
5851 204258 ch += c - '0';
5852 }
5853
3/4
✓ Branch 0 taken 52242 times.
✓ Branch 1 taken 2366 times.
✓ Branch 2 taken 52242 times.
✗ Branch 3 not taken.
54608 else if (c >= 'a' && c <= 'f') {
5854 52242 ch += c - ('a' - 10);
5855 }
5856
2/4
✓ Branch 0 taken 2366 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2366 times.
✗ Branch 3 not taken.
2366 else if (c >= 'A' && c <= 'F') {
5857 2366 ch += c - ('A' - 10);
5858 }
5859 else {
5860 goto error;
5861 }
5862 }
5863
5864 /* when we get here, ch is a 32-bit unicode character */
5865
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 47341 times.
47341 if (ch > MAX_UNICODE) {
5866 message = "illegal Unicode character";
5867 goto error;
5868 }
5869
5870
3/4
✓ Branch 0 taken 29270 times.
✓ Branch 1 taken 18071 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 18071 times.
47341 WRITE_CHAR(ch);
5871 47341 continue;
5872
5873 /* \N{name} */
5874 142 case 'N':
5875
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 141 times.
142 if (ucnhash_capi == NULL) {
5876 /* load the unicode data module */
5877 1 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5878 PyUnicodeData_CAPSULE_NAME, 1);
5879
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (ucnhash_capi == NULL) {
5880 PyErr_SetString(
5881 PyExc_UnicodeError,
5882 "\\N escapes not supported (can't load unicodedata module)"
5883 );
5884 goto onError;
5885 }
5886 }
5887
5888 142 message = "malformed \\N character escape";
5889
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 142 times.
142 if (s >= end) {
5890 goto incomplete;
5891 }
5892
1/2
✓ Branch 0 taken 142 times.
✗ Branch 1 not taken.
142 if (*s == '{') {
5893 142 const char *start = ++s;
5894 size_t namelen;
5895 /* look for the closing brace */
5896
3/4
✓ Branch 0 taken 3306 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 3164 times.
✓ Branch 3 taken 142 times.
3306 while (s < end && *s != '}')
5897 3164 s++;
5898
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 142 times.
142 if (s >= end) {
5899 goto incomplete;
5900 }
5901 142 namelen = s - start;
5902
1/2
✓ Branch 0 taken 142 times.
✗ Branch 1 not taken.
142 if (namelen) {
5903 /* found a name. look it up in the unicode database */
5904 142 s++;
5905 142 ch = 0xffffffff; /* in case 'getcode' messes up */
5906
2/4
✓ Branch 0 taken 142 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 142 times.
✗ Branch 3 not taken.
284 if (namelen <= INT_MAX &&
5907 142 ucnhash_capi->getcode(start, (int)namelen,
5908 &ch, 0)) {
5909 assert(ch <= MAX_UNICODE);
5910
3/4
✓ Branch 0 taken 20 times.
✓ Branch 1 taken 122 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 122 times.
142 WRITE_CHAR(ch);
5911 142 continue;
5912 }
5913 message = "unknown Unicode character name";
5914 }
5915 }
5916 goto error;
5917
5918 1 default:
5919
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (*first_invalid_escape == NULL) {
5920 1 *first_invalid_escape = s-1; /* Back up one char, since we've
5921 already incremented s. */
5922 }
5923 1 WRITE_ASCII_CHAR('\\');
5924
1/4
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
1 WRITE_CHAR(c);
5925 1 continue;
5926 }
5927
5928 incomplete:
5929 if (consumed) {
5930 *consumed = startinpos;
5931 break;
5932 }
5933 error:;
5934 Py_ssize_t endinpos = s-starts;
5935 writer.min_length = end - s + writer.pos;
5936 if (unicode_decode_call_errorhandler_writer(
5937 errors, &errorHandler,
5938 "unicodeescape", message,
5939 &starts, &end, &startinpos, &endinpos, &exc, &s,
5940 &writer)) {
5941 goto onError;
5942 }
5943 assert(end - s <= writer.size - writer.pos);
5944
5945 #undef WRITE_ASCII_CHAR
5946 #undef WRITE_CHAR
5947 }
5948
5949 57648 Py_XDECREF(errorHandler);
5950 57648 Py_XDECREF(exc);
5951 57648 return _PyUnicodeWriter_Finish(&writer);
5952
5953 onError:
5954 _PyUnicodeWriter_Dealloc(&writer);
5955 Py_XDECREF(errorHandler);
5956 Py_XDECREF(exc);
5957 return NULL;
5958 }
5959
5960 PyObject *
5961 _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
5962 Py_ssize_t size,
5963 const char *errors,
5964 Py_ssize_t *consumed)
5965 {
5966 const char *first_invalid_escape;
5967 PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
5968 consumed,
5969 &first_invalid_escape);
5970 if (result == NULL)
5971 return NULL;
5972 if (first_invalid_escape != NULL) {
5973 unsigned char c = *first_invalid_escape;
5974 if ('4' <= c && c <= '7') {
5975 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
5976 "invalid octal escape sequence '\\%.3s'",
5977 first_invalid_escape) < 0)
5978 {
5979 Py_DECREF(result);
5980 return NULL;
5981 }
5982 }
5983 else {
5984 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
5985 "invalid escape sequence '\\%c'",
5986 c) < 0)
5987 {
5988 Py_DECREF(result);
5989 return NULL;
5990 }
5991 }
5992 }
5993 return result;
5994 }
5995
5996 PyObject *
5997 PyUnicode_DecodeUnicodeEscape(const char *s,
5998 Py_ssize_t size,
5999 const char *errors)
6000 {
6001 return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6002 }
6003
6004 /* Return a Unicode-Escape string version of the Unicode object. */
6005
6006 PyObject *
6007 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6008 {
6009 Py_ssize_t i, len;
6010 PyObject *repr;
6011 char *p;
6012 int kind;
6013 const void *data;
6014 Py_ssize_t expandsize;
6015
6016 /* Initial allocation is based on the longest-possible character
6017 escape.
6018
6019 For UCS1 strings it's '\xxx', 4 bytes per source character.
6020 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6021 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6022 */
6023
6024 if (!PyUnicode_Check(unicode)) {
6025 PyErr_BadArgument();
6026 return NULL;
6027 }
6028
6029 len = PyUnicode_GET_LENGTH(unicode);
6030 if (len == 0) {
6031 return PyBytes_FromStringAndSize(NULL, 0);
6032 }
6033
6034 kind = PyUnicode_KIND(unicode);
6035 data = PyUnicode_DATA(unicode);
6036 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6037 bytes, and 1 byte characters 4. */
6038 expandsize = kind * 2 + 2;
6039 if (len > PY_SSIZE_T_MAX / expandsize) {
6040 return PyErr_NoMemory();
6041 }
6042 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6043 if (repr == NULL) {
6044 return NULL;
6045 }
6046
6047 p = PyBytes_AS_STRING(repr);
6048 for (i = 0; i < len; i++) {
6049 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6050
6051 /* U+0000-U+00ff range */
6052 if (ch < 0x100) {
6053 if (ch >= ' ' && ch < 127) {
6054 if (ch != '\\') {
6055 /* Copy printable US ASCII as-is */
6056 *p++ = (char) ch;
6057 }
6058 /* Escape backslashes */
6059 else {
6060 *p++ = '\\';
6061 *p++ = '\\';
6062 }
6063 }
6064
6065 /* Map special whitespace to '\t', \n', '\r' */
6066 else if (ch == '\t') {
6067 *p++ = '\\';
6068 *p++ = 't';
6069 }
6070 else if (ch == '\n') {
6071 *p++ = '\\';
6072 *p++ = 'n';
6073 }
6074 else if (ch == '\r') {
6075 *p++ = '\\';
6076 *p++ = 'r';
6077 }
6078
6079 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6080 else {
6081 *p++ = '\\';
6082 *p++ = 'x';
6083 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6084 *p++ = Py_hexdigits[ch & 0x000F];
6085 }
6086 }
6087 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6088 else if (ch < 0x10000) {
6089 *p++ = '\\';
6090 *p++ = 'u';
6091 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6092 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6093 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6094 *p++ = Py_hexdigits[ch & 0x000F];
6095 }
6096 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6097 else {
6098
6099 /* Make sure that the first two digits are zero */
6100 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6101 *p++ = '\\';
6102 *p++ = 'U';
6103 *p++ = '0';
6104 *p++ = '0';
6105 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6106 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6107 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6108 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6109 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6110 *p++ = Py_hexdigits[ch & 0x0000000F];
6111 }
6112 }
6113
6114 assert(p - PyBytes_AS_STRING(repr) > 0);
6115 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6116 return NULL;
6117 }
6118 return repr;
6119 }
6120
6121 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6122
6123 PyObject *
6124 _PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6125 Py_ssize_t size,
6126 const char *errors,
6127 Py_ssize_t *consumed)
6128 {
6129 const char *starts = s;
6130 _PyUnicodeWriter writer;
6131 const char *end;
6132 PyObject *errorHandler = NULL;
6133 PyObject *exc = NULL;
6134
6135 if (size == 0) {
6136 if (consumed) {
6137 *consumed = 0;
6138 }
6139 _Py_RETURN_UNICODE_EMPTY();
6140 }
6141
6142 /* Escaped strings will always be longer than the resulting
6143 Unicode string, so we start with size here and then reduce the
6144 length after conversion to the true value. (But decoding error
6145 handler might have to resize the string) */
6146 _PyUnicodeWriter_Init(&writer);
6147 writer.min_length = size;
6148 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6149 goto onError;
6150 }
6151
6152 end = s + size;
6153 while (s < end) {
6154 unsigned char c = (unsigned char) *s++;
6155 Py_UCS4 ch;
6156 int count;
6157 const char *message;
6158
6159 #define WRITE_CHAR(ch) \
6160 do { \
6161 if (ch <= writer.maxchar) { \
6162 assert(writer.pos < writer.size); \
6163 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6164 } \
6165 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6166 goto onError; \
6167 } \
6168 } while(0)
6169
6170 /* Non-escape characters are interpreted as Unicode ordinals */
6171 if (c != '\\' || (s >= end && !consumed)) {
6172 WRITE_CHAR(c);
6173 continue;
6174 }
6175
6176 Py_ssize_t startinpos = s - starts - 1;
6177 /* \ - Escapes */
6178 if (s >= end) {
6179 assert(consumed);
6180 // Set message to silent compiler warning.
6181 // Actually it is never used.
6182 message = "\\ at end of string";
6183 goto incomplete;
6184 }
6185
6186 c = (unsigned char) *s++;
6187 if (c == 'u') {
6188 count = 4;
6189 message = "truncated \\uXXXX escape";
6190 }
6191 else if (c == 'U') {
6192 count = 8;
6193 message = "truncated \\UXXXXXXXX escape";
6194 }
6195 else {
6196 assert(writer.pos < writer.size);
6197 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6198 WRITE_CHAR(c);
6199 continue;
6200 }
6201
6202 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6203 for (ch = 0; count; ++s, --count) {
6204 if (s >= end) {
6205 goto incomplete;
6206 }
6207 c = (unsigned char)*s;
6208 ch <<= 4;
6209 if (c >= '0' && c <= '9') {
6210 ch += c - '0';
6211 }
6212 else if (c >= 'a' && c <= 'f') {
6213 ch += c - ('a' - 10);
6214 }
6215 else if (c >= 'A' && c <= 'F') {
6216 ch += c - ('A' - 10);
6217 }
6218 else {
6219 goto error;
6220 }
6221 }
6222 if (ch > MAX_UNICODE) {
6223 message = "\\Uxxxxxxxx out of range";
6224 goto error;
6225 }
6226 WRITE_CHAR(ch);
6227 continue;
6228
6229 incomplete:
6230 if (consumed) {
6231 *consumed = startinpos;
6232 break;
6233 }
6234 error:;
6235 Py_ssize_t endinpos = s-starts;
6236 writer.min_length = end - s + writer.pos;
6237 if (unicode_decode_call_errorhandler_writer(
6238 errors, &errorHandler,
6239 "rawunicodeescape", message,
6240 &starts, &end, &startinpos, &endinpos, &exc, &s,
6241 &writer)) {
6242 goto onError;
6243 }
6244 assert(end - s <= writer.size - writer.pos);
6245
6246 #undef WRITE_CHAR
6247 }
6248 Py_XDECREF(errorHandler);
6249 Py_XDECREF(exc);
6250 return _PyUnicodeWriter_Finish(&writer);
6251
6252 onError:
6253 _PyUnicodeWriter_Dealloc(&writer);
6254 Py_XDECREF(errorHandler);
6255 Py_XDECREF(exc);
6256 return NULL;
6257 }
6258
6259 PyObject *
6260 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6261 Py_ssize_t size,
6262 const char *errors)
6263 {
6264 return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6265 }
6266
6267
6268 PyObject *
6269 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6270 {
6271 PyObject *repr;
6272 char *p;
6273 Py_ssize_t expandsize, pos;
6274 int kind;
6275 const void *data;
6276 Py_ssize_t len;
6277
6278 if (!PyUnicode_Check(unicode)) {
6279 PyErr_BadArgument();
6280 return NULL;
6281 }
6282 kind = PyUnicode_KIND(unicode);
6283 data = PyUnicode_DATA(unicode);
6284 len = PyUnicode_GET_LENGTH(unicode);
6285 if (kind == PyUnicode_1BYTE_KIND) {
6286 return PyBytes_FromStringAndSize(data, len);
6287 }
6288
6289 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6290 bytes, and 1 byte characters 4. */
6291 expandsize = kind * 2 + 2;
6292
6293 if (len > PY_SSIZE_T_MAX / expandsize) {
6294 return PyErr_NoMemory();
6295 }
6296 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6297 if (repr == NULL) {
6298 return NULL;
6299 }
6300 if (len == 0) {
6301 return repr;
6302 }
6303
6304 p = PyBytes_AS_STRING(repr);
6305 for (pos = 0; pos < len; pos++) {
6306 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6307
6308 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6309 if (ch < 0x100) {
6310 *p++ = (char) ch;
6311 }
6312 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6313 else if (ch < 0x10000) {
6314 *p++ = '\\';
6315 *p++ = 'u';
6316 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6317 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6318 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6319 *p++ = Py_hexdigits[ch & 15];
6320 }
6321 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6322 else {
6323 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6324 *p++ = '\\';
6325 *p++ = 'U';
6326 *p++ = '0';
6327 *p++ = '0';
6328 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6329 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6330 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6331 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6332 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6333 *p++ = Py_hexdigits[ch & 15];
6334 }
6335 }
6336
6337 assert(p > PyBytes_AS_STRING(repr));
6338 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6339 return NULL;
6340 }
6341 return repr;
6342 }
6343
6344 /* --- Latin-1 Codec ------------------------------------------------------ */
6345
6346 PyObject *
6347 49281 PyUnicode_DecodeLatin1(const char *s,
6348 Py_ssize_t size,
6349 const char *errors)
6350 {
6351 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6352 49281 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
6353 }
6354
6355 /* create or adjust a UnicodeEncodeError */
6356 static void
6357 1 make_encode_exception(PyObject **exceptionObject,
6358 const char *encoding,
6359 PyObject *unicode,
6360 Py_ssize_t startpos, Py_ssize_t endpos,
6361 const char *reason)
6362 {
6363
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (*exceptionObject == NULL) {
6364 1 *exceptionObject = PyObject_CallFunction(
6365 PyExc_UnicodeEncodeError, "sOnns",
6366 encoding, unicode, startpos, endpos, reason);
6367 }
6368 else {
6369 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6370 goto onError;
6371 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6372 goto onError;
6373 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6374 goto onError;
6375 return;
6376 onError:
6377 Py_CLEAR(*exceptionObject);
6378 }
6379 }
6380
6381 /* raises a UnicodeEncodeError */
6382 static void
6383 raise_encode_exception(PyObject **exceptionObject,
6384 const char *encoding,
6385 PyObject *unicode,
6386 Py_ssize_t startpos, Py_ssize_t endpos,
6387 const char *reason)
6388 {
6389 make_encode_exception(exceptionObject,
6390 encoding, unicode, startpos, endpos, reason);
6391 if (*exceptionObject != NULL)
6392 PyCodec_StrictErrors(*exceptionObject);
6393 }
6394
6395 /* error handling callback helper:
6396 build arguments, call the callback and check the arguments,
6397 put the result into newpos and return the replacement string, which
6398 has to be freed by the caller */
6399 static PyObject *
6400 1 unicode_encode_call_errorhandler(const char *errors,
6401 PyObject **errorHandler,
6402 const char *encoding, const char *reason,
6403 PyObject *unicode, PyObject **exceptionObject,
6404 Py_ssize_t startpos, Py_ssize_t endpos,
6405 Py_ssize_t *newpos)
6406 {
6407 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6408 Py_ssize_t len;
6409 PyObject *restuple;
6410 PyObject *resunicode;
6411
6412
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (*errorHandler == NULL) {
6413 1 *errorHandler = PyCodec_LookupError(errors);
6414
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (*errorHandler == NULL)
6415 return NULL;
6416 }
6417
6418 1 len = PyUnicode_GET_LENGTH(unicode);
6419
6420 1 make_encode_exception(exceptionObject,
6421 encoding, unicode, startpos, endpos, reason);
6422
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (*exceptionObject == NULL)
6423 return NULL;
6424
6425 1 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
6426
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (restuple == NULL)
6427 1 return NULL;
6428 if (!PyTuple_Check(restuple)) {
6429 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6430 Py_DECREF(restuple);
6431 return NULL;
6432 }
6433 if (!PyArg_ParseTuple(restuple, argparse,
6434 &resunicode, newpos)) {
6435 Py_DECREF(restuple);
6436 return NULL;
6437 }
6438 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6439 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6440 Py_DECREF(restuple);
6441 return NULL;
6442 }
6443 if (*newpos<0)
6444 *newpos = len + *newpos;
6445 if (*newpos<0 || *newpos>len) {
6446 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6447 Py_DECREF(restuple);
6448 return NULL;
6449 }
6450 Py_INCREF(resunicode);
6451 Py_DECREF(restuple);
6452 return resunicode;
6453 }
6454
6455 static PyObject *
6456 1 unicode_encode_ucs1(PyObject *unicode,
6457 const char *errors,
6458 const Py_UCS4 limit)
6459 {
6460 /* input state */
6461 1 Py_ssize_t pos=0, size;
6462 int kind;
6463 const void *data;
6464 /* pointer into the output */
6465 char *str;
6466
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6467
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6468 1 PyObject *error_handler_obj = NULL;
6469 1 PyObject *exc = NULL;
6470 1 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6471 1 PyObject *rep = NULL;
6472 /* output object */
6473 _PyBytesWriter writer;
6474
6475 1 size = PyUnicode_GET_LENGTH(unicode);
6476 1 kind = PyUnicode_KIND(unicode);
6477 1 data = PyUnicode_DATA(unicode);
6478 /* allocate enough for a simple encoding without
6479 replacements, if we need more, we'll resize */
6480
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (size == 0)
6481 return PyBytes_FromStringAndSize(NULL, 0);
6482
6483 1 _PyBytesWriter_Init(&writer);
6484 1 str = _PyBytesWriter_Alloc(&writer, size);
6485
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (str == NULL)
6486 return NULL;
6487
6488
2/2
✓ Branch 0 taken 45 times.
✓ Branch 1 taken 1 times.
46 while (pos < size) {
6489 45 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6490
6491 /* can we encode this? */
6492
2/2
✓ Branch 0 taken 44 times.
✓ Branch 1 taken 1 times.
45 if (ch < limit) {
6493 /* no overflow check, because we know that the space is enough */
6494 44 *str++ = (char)ch;
6495 44 ++pos;
6496 }
6497 else {
6498 Py_ssize_t newpos, i;
6499 /* startpos for collecting unencodable chars */
6500 1 Py_ssize_t collstart = pos;
6501 1 Py_ssize_t collend = collstart + 1;
6502 /* find all unecodable characters */
6503
6504
3/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 3 taken 1 times.
✓ Branch 4 taken 1 times.
2 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6505 1 ++collend;
6506
6507 /* Only overallocate the buffer if it's not the last write */
6508 1 writer.overallocate = (collend < size);
6509
6510 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6511
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (error_handler == _Py_ERROR_UNKNOWN)
6512 1 error_handler = _Py_GetErrorHandler(errors);
6513
6514
1/7
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 1 times.
✗ Branch 6 not taken.
1 switch (error_handler) {
6515 case _Py_ERROR_STRICT:
6516 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6517 goto onError;
6518
6519 case _Py_ERROR_REPLACE:
6520 memset(str, '?', collend - collstart);
6521 str += (collend - collstart);
6522 /* fall through */
6523 case _Py_ERROR_IGNORE:
6524 pos = collend;
6525 break;
6526
6527 case _Py_ERROR_BACKSLASHREPLACE:
6528 /* subtract preallocated bytes */
6529 writer.min_size -= (collend - collstart);
6530 str = backslashreplace(&writer, str,
6531 unicode, collstart, collend);
6532 if (str == NULL)
6533 goto onError;
6534 pos = collend;
6535 break;
6536
6537 case _Py_ERROR_XMLCHARREFREPLACE:
6538 /* subtract preallocated bytes */
6539 writer.min_size -= (collend - collstart);
6540 str = xmlcharrefreplace(&writer, str,
6541 unicode, collstart, collend);
6542 if (str == NULL)
6543 goto onError;
6544 pos = collend;
6545 break;
6546
6547 1 case _Py_ERROR_SURROGATEESCAPE:
6548
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1 times.
3 for (i = collstart; i < collend; ++i) {
6549 2 ch = PyUnicode_READ(kind, data, i);
6550
2/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
2 if (ch < 0xdc80 || 0xdcff < ch) {
6551 /* Not a UTF-8b surrogate */
6552 break;
6553 }
6554 2 *str++ = (char)(ch - 0xdc00);
6555 2 ++pos;
6556 }
6557
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (i >= collend)
6558 1 break;
6559 collstart = pos;
6560 assert(collstart != collend);
6561 /* fall through */
6562
6563 default:
6564 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6565 encoding, reason, unicode, &exc,
6566 collstart, collend, &newpos);
6567 if (rep == NULL)
6568 goto onError;
6569
6570 if (newpos < collstart) {
6571 writer.overallocate = 1;
6572 str = _PyBytesWriter_Prepare(&writer, str,
6573 collstart - newpos);
6574 if (str == NULL)
6575 goto onError;
6576 }
6577 else {
6578 /* subtract preallocated bytes */
6579 writer.min_size -= newpos - collstart;
6580 /* Only overallocate the buffer if it's not the last write */
6581 writer.overallocate = (newpos < size);
6582 }
6583
6584 if (PyBytes_Check(rep)) {
6585 /* Directly copy bytes result to output. */
6586 str = _PyBytesWriter_WriteBytes(&writer, str,
6587 PyBytes_AS_STRING(rep),
6588 PyBytes_GET_SIZE(rep));
6589 }
6590 else {
6591 assert(PyUnicode_Check(rep));
6592
6593 if (limit == 256 ?
6594 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6595 !PyUnicode_IS_ASCII(rep))
6596 {
6597 /* Not all characters are smaller than limit */
6598 raise_encode_exception(&exc, encoding, unicode,
6599 collstart, collend, reason);
6600 goto onError;
6601 }
6602 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6603 str = _PyBytesWriter_WriteBytes(&writer, str,
6604 PyUnicode_DATA(rep),
6605 PyUnicode_GET_LENGTH(rep));
6606 }
6607 if (str == NULL)
6608 goto onError;
6609
6610 pos = newpos;
6611 Py_CLEAR(rep);
6612 }
6613
6614 /* If overallocation was disabled, ensure that it was the last
6615 write. Otherwise, we missed an optimization */
6616 assert(writer.overallocate || pos == size);
6617 }
6618 }
6619
6620 1 Py_XDECREF(error_handler_obj);
6621 1 Py_XDECREF(exc);
6622 1 return _PyBytesWriter_Finish(&writer, str);
6623
6624 onError:
6625 Py_XDECREF(rep);
6626 _PyBytesWriter_Dealloc(&writer);
6627 Py_XDECREF(error_handler_obj);
6628 Py_XDECREF(exc);
6629 return NULL;
6630 }
6631
6632 PyObject *
6633 113578 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6634 {
6635
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 113578 times.
113578 if (!PyUnicode_Check(unicode)) {
6636 PyErr_BadArgument();
6637 return NULL;
6638 }
6639 /* Fast path: if it is a one-byte string, construct
6640 bytes object directly. */
6641
1/2
✓ Branch 0 taken 113578 times.
✗ Branch 1 not taken.
113578 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6642 113578 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6643 PyUnicode_GET_LENGTH(unicode));
6644 /* Non-Latin-1 characters present. Defer to above function to
6645 raise the exception. */
6646 return unicode_encode_ucs1(unicode, errors, 256);
6647 }
6648
6649 PyObject*
6650 PyUnicode_AsLatin1String(PyObject *unicode)
6651 {
6652 return _PyUnicode_AsLatin1String(unicode, NULL);
6653 }
6654
6655 /* --- 7-bit ASCII Codec -------------------------------------------------- */
6656
6657 PyObject *
6658 1060462 PyUnicode_DecodeASCII(const char *s,
6659 Py_ssize_t size,
6660 const char *errors)
6661 {
6662 1060462 const char *starts = s;
6663 1060462 const char *e = s + size;
6664 1060462 PyObject *error_handler_obj = NULL;
6665 1060462 PyObject *exc = NULL;
6666 1060462 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6667
6668
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 1060446 times.
1060462 if (size == 0)
6669 16 _Py_RETURN_UNICODE_EMPTY();
6670
6671 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6672
3/4
✓ Branch 0 taken 257 times.
✓ Branch 1 taken 1060189 times.
✓ Branch 2 taken 257 times.
✗ Branch 3 not taken.
1060446 if (size == 1 && (unsigned char)s[0] < 128) {
6673 257 return get_latin1_char((unsigned char)s[0]);
6674 }
6675
6676 // Shortcut for simple case
6677 1060189 PyObject *u = PyUnicode_New(size, 127);
6678
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1060189 times.
1060189 if (u == NULL) {
6679 return NULL;
6680 }
6681 1060189 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
6682
2/2
✓ Branch 0 taken 1060188 times.
✓ Branch 1 taken 1 times.
1060189 if (outpos == size) {
6683 1060188 return u;
6684 }
6685
6686 _PyUnicodeWriter writer;
6687 1 _PyUnicodeWriter_InitWithBuffer(&writer, u);
6688 1 writer.pos = outpos;
6689
6690 1 s += outpos;
6691 1 int kind = writer.kind;
6692 1 void *data = writer.data;
6693 Py_ssize_t startinpos, endinpos;
6694
6695
2/2
✓ Branch 0 taken 1084 times.
✓ Branch 1 taken 1 times.
1085 while (s < e) {
6696 1084 unsigned char c = (unsigned char)*s;
6697
2/2
✓ Branch 0 taken 1082 times.
✓ Branch 1 taken 2 times.
1084 if (c < 128) {
6698 1082 PyUnicode_WRITE(kind, data, writer.pos, c);
6699 1082 writer.pos++;
6700 1082 ++s;
6701 1082 continue;
6702 }
6703
6704 /* byte outsize range 0x00..0x7f: call the error handler */
6705
6706
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 if (error_handler == _Py_ERROR_UNKNOWN)
6707 1 error_handler = _Py_GetErrorHandler(errors);
6708
6709
1/3
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
2 switch (error_handler)
6710 {
6711 2 case _Py_ERROR_REPLACE:
6712 case _Py_ERROR_SURROGATEESCAPE:
6713 /* Fast-path: the error handler only writes one character,
6714 but we may switch to UCS2 at the first write */
6715
3/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 1 times.
2 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6716 goto onError;
6717 2 kind = writer.kind;
6718 2 data = writer.data;
6719
6720
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 if (error_handler == _Py_ERROR_REPLACE)
6721 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6722 else
6723 2 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6724 2 writer.pos++;
6725 2 ++s;
6726 2 break;
6727
6728 case _Py_ERROR_IGNORE:
6729 ++s;
6730 break;
6731
6732 default:
6733 startinpos = s-starts;
6734 endinpos = startinpos + 1;
6735 if (unicode_decode_call_errorhandler_writer(
6736 errors, &error_handler_obj,
6737 "ascii", "ordinal not in range(128)",
6738 &starts, &e, &startinpos, &endinpos, &exc, &s,
6739 &writer))
6740 goto onError;
6741 kind = writer.kind;
6742 data = writer.data;
6743 }
6744 }
6745 1 Py_XDECREF(error_handler_obj);
6746 1 Py_XDECREF(exc);
6747 1 return _PyUnicodeWriter_Finish(&writer);
6748
6749 onError:
6750 _PyUnicodeWriter_Dealloc(&writer);
6751 Py_XDECREF(error_handler_obj);
6752 Py_XDECREF(exc);
6753 return NULL;
6754 }
6755
6756 PyObject *
6757 1034888 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6758 {
6759
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 1034888 times.
1034888 if (!PyUnicode_Check(unicode)) {
6760 PyErr_BadArgument();
6761 return NULL;
6762 }
6763 /* Fast path: if it is an ASCII-only string, construct bytes object
6764 directly. Else defer to above function to raise the exception. */
6765
2/2
✓ Branch 1 taken 1034887 times.
✓ Branch 2 taken 1 times.
1034888 if (PyUnicode_IS_ASCII(unicode))
6766 1034887 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6767 PyUnicode_GET_LENGTH(unicode));
6768 1 return unicode_encode_ucs1(unicode, errors, 128);
6769 }
6770
6771 PyObject *
6772 20859 PyUnicode_AsASCIIString(PyObject *unicode)
6773 {
6774 20859 return _PyUnicode_AsASCIIString(unicode, NULL);
6775 }
6776
6777 #ifdef MS_WINDOWS
6778
6779 /* --- MBCS codecs for Windows -------------------------------------------- */
6780
6781 #if SIZEOF_INT < SIZEOF_SIZE_T
6782 #define NEED_RETRY
6783 #endif
6784
6785 /* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
6786 transcoding from UTF-16), but INT_MAX / 4 performs better in
6787 both cases also and avoids partial characters overrunning the
6788 length limit in MultiByteToWideChar on Windows */
6789 #define DECODING_CHUNK_SIZE (INT_MAX/4)
6790
6791 #ifndef WC_ERR_INVALID_CHARS
6792 # define WC_ERR_INVALID_CHARS 0x0080
6793 #endif
6794
6795 static const char*
6796 code_page_name(UINT code_page, PyObject **obj)
6797 {
6798 *obj = NULL;
6799 if (code_page == CP_ACP)
6800 return "mbcs";
6801 if (code_page == CP_UTF7)
6802 return "CP_UTF7";
6803 if (code_page == CP_UTF8)
6804 return "CP_UTF8";
6805
6806 *obj = PyBytes_FromFormat("cp%u", code_page);
6807 if (*obj == NULL)
6808 return NULL;
6809 return PyBytes_AS_STRING(*obj);
6810 }
6811
6812 static DWORD
6813 decode_code_page_flags(UINT code_page)
6814 {
6815 if (code_page == CP_UTF7) {
6816 /* The CP_UTF7 decoder only supports flags=0 */
6817 return 0;
6818 }
6819 else
6820 return MB_ERR_INVALID_CHARS;
6821 }
6822
6823 /*
6824 * Decode a byte string from a Windows code page into unicode object in strict
6825 * mode.
6826 *
6827 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6828 * OSError and returns -1 on other error.
6829 */
6830 static int
6831 decode_code_page_strict(UINT code_page,
6832 wchar_t **buf,
6833 Py_ssize_t *bufsize,
6834 const char *in,
6835 int insize)
6836 {
6837 DWORD flags = MB_ERR_INVALID_CHARS;
6838 wchar_t *out;
6839 DWORD outsize;
6840
6841 /* First get the size of the result */
6842 assert(insize > 0);
6843 while ((outsize = MultiByteToWideChar(code_page, flags,
6844 in, insize, NULL, 0)) <= 0)
6845 {
6846 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
6847 goto error;
6848 }
6849 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
6850 flags = 0;
6851 }
6852
6853 /* Extend a wchar_t* buffer */
6854 Py_ssize_t n = *bufsize; /* Get the current length */
6855 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
6856 return -1;
6857 }
6858 out = *buf + n;
6859
6860 /* Do the conversion */
6861 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6862 if (outsize <= 0)
6863 goto error;
6864 return insize;
6865
6866 error:
6867 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6868 return -2;
6869 PyErr_SetFromWindowsErr(0);
6870 return -1;
6871 }
6872
6873 /*
6874 * Decode a byte string from a code page into unicode object with an error
6875 * handler.  Output is appended to the wchar_t buffer *buf / *bufsize.
6876 *
6877 * Returns the number of input bytes consumed on success, or raise an OSError or
6878 * UnicodeDecodeError exception and returns -1 on error.
6879 */
6880 static int
6881 decode_code_page_errors(UINT code_page,
6882 wchar_t **buf,
6883 Py_ssize_t *bufsize,
6884 const char *in, const int size,
6885 const char *errors, int final)
6886 {
6887 const char *startin = in;
6888 const char *endin = in + size;
6889 DWORD flags = MB_ERR_INVALID_CHARS;
6890 /* Ideally, we should get reason from FormatMessage. This is the Windows
6891 2000 English version of the message. */
6892 const char *reason = "No mapping for the Unicode character exists "
6893 "in the target code page.";
6894 /* each step cannot decode more than 1 character, but a character can be
6895 represented as a surrogate pair */
6896 wchar_t buffer[2], *out;
6897 int insize;
6898 Py_ssize_t outsize;
6899 PyObject *errorHandler = NULL;
6900 PyObject *exc = NULL;
6901 PyObject *encoding_obj = NULL;
6902 const char *encoding;
6903 DWORD err;
6904 int ret = -1; /* only replaced by the consumed byte count once the loop completes */
6905
6906 assert(size > 0);
6907
6908 encoding = code_page_name(code_page, &encoding_obj);
6909 if (encoding == NULL)
6910 return -1;
6911
6912 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
6913 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6914 UnicodeDecodeError. */
6915 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6916 if (exc != NULL) {
6917 PyCodec_StrictErrors(exc);
6918 Py_CLEAR(exc);
6919 }
6920 goto error;
6921 }
6922
6923 /* Extend a wchar_t* buffer */
6924 Py_ssize_t n = *bufsize; /* Get the current length */
6925 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { /* guard n + size * 2 against overflow */
6926 PyErr_NoMemory();
6927 goto error;
6928 }
6929 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) { /* reserve two wchar_t per input byte */
6930 goto error;
6931 }
6932 out = *buf + n;
6933
6934 /* Decode the byte string character per character */
6935 while (in < endin)
6936 {
6937 /* Decode a character */
6938 insize = 1; /* try one byte first, then widen the window on failure */
6939 do
6940 {
6941 outsize = MultiByteToWideChar(code_page, flags,
6942 in, insize,
6943 buffer, Py_ARRAY_LENGTH(buffer));
6944 if (outsize > 0)
6945 break;
6946 err = GetLastError();
6947 if (err == ERROR_INVALID_FLAGS && flags) {
6948 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
6949 flags = 0;
6950 continue;
6951 }
6952 if (err != ERROR_NO_UNICODE_TRANSLATION
6953 && err != ERROR_INSUFFICIENT_BUFFER)
6954 {
6955 PyErr_SetFromWindowsErr(0);
6956 goto error;
6957 }
6958 insize++;
6959 }
6960 /* 4=maximum length of a UTF-8 sequence */
6961 while (insize <= 4 && (in + insize) <= endin);
6962
6963 if (outsize <= 0) {
6964 Py_ssize_t startinpos, endinpos, outpos;
6965
6966 /* last character in partial decode? */
6967 if (in + insize >= endin && !final)
6968 break;
6969
6970 startinpos = in - startin;
6971 endinpos = startinpos + 1; /* the handler is told about a single undecodable byte */
6972 outpos = out - *buf;
6973 if (unicode_decode_call_errorhandler_wchar(
6974 errors, &errorHandler,
6975 encoding, reason,
6976 &startin, &endin, &startinpos, &endinpos, &exc, &in,
6977 buf, bufsize, &outpos))
6978 {
6979 goto error;
6980 }
6981 out = *buf + outpos; /* *buf is passed by address above, so recompute out from it */
6982 }
6983 else {
6984 in += insize;
6985 memcpy(out, buffer, outsize * sizeof(wchar_t));
6986 out += outsize;
6987 }
6988 }
6989
6990 /* Shrink the buffer */
6991 assert(out - *buf <= *bufsize);
6992 *bufsize = out - *buf;
6993 /* (in - startin) <= size and size is an int */
6994 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
6995
6996 error: /* shared exit: reached on success as well, with ret already set */
6997 Py_XDECREF(encoding_obj);
6998 Py_XDECREF(errorHandler);
6999 Py_XDECREF(exc);
7000 return ret;
7001 }
7002
7003 static PyObject *
7004 decode_code_page_stateful(int code_page,
7005 const char *s, Py_ssize_t size,
7006 const char *errors, Py_ssize_t *consumed)
7007 {
7008 wchar_t *buf = NULL;
7009 Py_ssize_t bufsize = 0;
7010 int chunk_size, final, converted, done;
7011
7012 if (code_page < 0) {
7013 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7014 return NULL;
7015 }
7016 if (size < 0) {
7017 PyErr_BadInternalCall();
7018 return NULL;
7019 }
7020
7021 if (consumed)
7022 *consumed = 0;
7023
7024 do
7025 {
7026 #ifdef NEED_RETRY
7027 if (size > DECODING_CHUNK_SIZE) {
7028 chunk_size = DECODING_CHUNK_SIZE;
7029 final = 0;
7030 done = 0;
7031 }
7032 else
7033 #endif
7034 {
7035 chunk_size = (int)size;
7036 final = (consumed == NULL);
7037 done = 1;
7038 }
7039
7040 if (chunk_size == 0 && done) {
7041 if (buf != NULL)
7042 break;
7043 _Py_RETURN_UNICODE_EMPTY();
7044 }
7045
7046 converted = decode_code_page_strict(code_page, &buf, &bufsize,
7047 s, chunk_size);
7048 if (converted == -2)
7049 converted = decode_code_page_errors(code_page, &buf, &bufsize,
7050 s, chunk_size,
7051 errors, final);
7052 assert(converted != 0 || done);
7053
7054 if (converted < 0) {
7055 PyMem_Free(buf);
7056 return NULL;
7057 }
7058
7059 if (consumed)
7060 *consumed += converted;
7061
7062 s += converted;
7063 size -= converted;
7064 } while (!done);
7065
7066 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7067 PyMem_Free(buf);
7068 return v;
7069 }
7070
7071 PyObject *
7072 PyUnicode_DecodeCodePageStateful(int code_page,
7073 const char *s,
7074 Py_ssize_t size,
7075 const char *errors,
7076 Py_ssize_t *consumed)
7077 {
7078 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7079 }
7080
7081 PyObject *
7082 PyUnicode_DecodeMBCSStateful(const char *s,
7083 Py_ssize_t size,
7084 const char *errors,
7085 Py_ssize_t *consumed)
7086 {
7087 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7088 }
7089
7090 PyObject *
7091 PyUnicode_DecodeMBCS(const char *s,
7092 Py_ssize_t size,
7093 const char *errors)
7094 {
7095 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7096 }
7097
7098 static DWORD
7099 encode_code_page_flags(UINT code_page, const char *errors)
7100 {
7101 if (code_page == CP_UTF8) {
7102 return WC_ERR_INVALID_CHARS;
7103 }
7104 else if (code_page == CP_UTF7) {
7105 /* CP_UTF7 only supports flags=0 */
7106 return 0;
7107 }
7108 else {
7109 if (errors != NULL && strcmp(errors, "replace") == 0)
7110 return 0;
7111 else
7112 return WC_NO_BEST_FIT_CHARS;
7113 }
7114 }
7115
7116 /*
7117 * Encode a Unicode string to a Windows code page into a byte string in strict
7118 * mode.
7119 *
7120 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7121 * an OSError and returns -1 on other error.
7122 */
7123 static int
7124 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7125 PyObject *unicode, Py_ssize_t offset, int len,
7126 const char* errors)
7127 {
7128 BOOL usedDefaultChar = FALSE;
7129 BOOL *pusedDefaultChar = &usedDefaultChar;
7130 int outsize;
7131 wchar_t *p;
7132 Py_ssize_t size;
7133 const DWORD flags = encode_code_page_flags(code_page, NULL);
7134 char *out;
7135 /* Create a substring so that we can get the UTF-16 representation
7136 of just the slice under consideration. */
7137 PyObject *substring;
7138 int ret = -1;
7139
7140 assert(len > 0);
7141
7142 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7143 pusedDefaultChar = &usedDefaultChar;
7144 else
7145 pusedDefaultChar = NULL;
7146
7147 substring = PyUnicode_Substring(unicode, offset, offset+len);
7148 if (substring == NULL)
7149 return -1;
7150 p = PyUnicode_AsWideCharString(substring, &size);
7151 Py_CLEAR(substring);
7152 if (p == NULL) {
7153 return -1;
7154 }
7155 assert(size <= INT_MAX);
7156
7157 /* First get the size of the result */
7158 outsize = WideCharToMultiByte(code_page, flags,
7159 p, (int)size,
7160 NULL, 0,
7161 NULL, pusedDefaultChar);
7162 if (outsize <= 0)
7163 goto error;
7164 /* If we used a default char, then we failed! */
7165 if (pusedDefaultChar && *pusedDefaultChar) {
7166 ret = -2;
7167 goto done;
7168 }
7169
7170 if (*outbytes == NULL) {
7171 /* Create string object */
7172 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7173 if (*outbytes == NULL) {
7174 goto done;
7175 }
7176 out = PyBytes_AS_STRING(*outbytes);
7177 }
7178 else {
7179 /* Extend string object */
7180 const Py_ssize_t n = PyBytes_Size(*outbytes);
7181 if (outsize > PY_SSIZE_T_MAX - n) {
7182 PyErr_NoMemory();
7183 goto done;
7184 }
7185 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7186 goto done;
7187 }
7188 out = PyBytes_AS_STRING(*outbytes) + n;
7189 }
7190
7191 /* Do the conversion */
7192 outsize = WideCharToMultiByte(code_page, flags,
7193 p, (int)size,
7194 out, outsize,
7195 NULL, pusedDefaultChar);
7196 if (outsize <= 0)
7197 goto error;
7198 if (pusedDefaultChar && *pusedDefaultChar) {
7199 ret = -2;
7200 goto done;
7201 }
7202 ret = 0;
7203
7204 done:
7205 PyMem_Free(p);
7206 return ret;
7207
7208 error:
7209 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7210 ret = -2;
7211 goto done;
7212 }
7213 PyErr_SetFromWindowsErr(0);
7214 goto done;
7215 }
7216
7217 /*
7218 * Encode a Unicode string to a Windows code page into a byte string using an
7219 * error handler.  Output is stored in *outbytes (created if NULL, else extended).
7220 *
7221 * Returns 0 on success, or raise an exception (OSError, UnicodeEncodeError, ...) and returns
7222 * -1 on error.
7223 */
7224 static int
7225 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7226 PyObject *unicode, Py_ssize_t unicode_offset,
7227 Py_ssize_t insize, const char* errors)
7228 {
7229 const DWORD flags = encode_code_page_flags(code_page, errors);
7230 Py_ssize_t pos = unicode_offset;
7231 Py_ssize_t endin = unicode_offset + insize;
7232 /* Ideally, we should get reason from FormatMessage. This is the Windows
7233 2000 English version of the message. */
7234 const char *reason = "invalid character";
7235 /* 4=maximum length of a UTF-8 sequence */
7236 char buffer[4];
7237 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7238 Py_ssize_t outsize;
7239 char *out;
7240 PyObject *errorHandler = NULL;
7241 PyObject *exc = NULL;
7242 PyObject *encoding_obj = NULL;
7243 const char *encoding;
7244 Py_ssize_t newpos, newoutsize;
7245 PyObject *rep;
7246 int ret = -1; /* set to 0 only after the final resize succeeds */
7247
7248 assert(insize > 0);
7249
7250 encoding = code_page_name(code_page, &encoding_obj);
7251 if (encoding == NULL)
7252 return -1;
7253
7254 if (errors == NULL || strcmp(errors, "strict") == 0) {
7255 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7256 then we raise a UnicodeEncodeError. */
7257 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7258 if (exc != NULL) {
7259 PyCodec_StrictErrors(exc);
7260 Py_DECREF(exc);
7261 }
7262 Py_XDECREF(encoding_obj);
7263 return -1;
7264 }
7265
7266 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7267 pusedDefaultChar = &usedDefaultChar;
7268 else
7269 pusedDefaultChar = NULL;
7270
7271 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { /* overflow guard for insize * 4 below */
7272 PyErr_NoMemory();
7273 goto error;
7274 }
7275 outsize = insize * Py_ARRAY_LENGTH(buffer); /* reserve sizeof(buffer) bytes per input character */
7276
7277 if (*outbytes == NULL) {
7278 /* Create string object */
7279 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7280 if (*outbytes == NULL)
7281 goto error;
7282 out = PyBytes_AS_STRING(*outbytes);
7283 }
7284 else {
7285 /* Extend string object */
7286 Py_ssize_t n = PyBytes_Size(*outbytes);
7287 if (n > PY_SSIZE_T_MAX - outsize) {
7288 PyErr_NoMemory();
7289 goto error;
7290 }
7291 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7292 goto error;
7293 out = PyBytes_AS_STRING(*outbytes) + n;
7294 }
7295
7296 /* Encode the string character per character */
7297 while (pos < endin)
7298 {
7299 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7300 wchar_t chars[2];
7301 int charsize;
7302 if (ch < 0x10000) {
7303 chars[0] = (wchar_t)ch;
7304 charsize = 1;
7305 }
7306 else {
7307 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch); /* code points >= 0x10000 are passed as a surrogate pair */
7308 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7309 charsize = 2;
7310 }
7311
7312 outsize = WideCharToMultiByte(code_page, flags,
7313 chars, charsize,
7314 buffer, Py_ARRAY_LENGTH(buffer),
7315 NULL, pusedDefaultChar);
7316 if (outsize > 0) {
7317 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7318 {
7319 pos++;
7320 memcpy(out, buffer, outsize);
7321 out += outsize;
7322 continue;
7323 }
7324 /* a default character was substituted: fall through to the error handler */
7325 }
7326 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7327 PyErr_SetFromWindowsErr(0);
7328 goto error;
7329 }
7330
7331 rep = unicode_encode_call_errorhandler(
7332 errors, &errorHandler, encoding, reason,
7333 unicode, &exc,
7334 pos, pos + 1, &newpos);
7335 if (rep == NULL)
7336 goto error;
7337
7338 Py_ssize_t morebytes = pos - newpos; /* becomes (replacement length) - (newpos - pos) once outsize is added below */
7339 if (PyBytes_Check(rep)) {
7340 outsize = PyBytes_GET_SIZE(rep);
7341 morebytes += outsize;
7342 if (morebytes > 0) {
7343 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7344 newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
7345 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7346 Py_DECREF(rep);
7347 goto error;
7348 }
7349 out = PyBytes_AS_STRING(*outbytes) + offset; /* *outbytes is passed by address to the resize, so re-derive out */
7350 }
7351 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7352 out += outsize;
7353 }
7354 else {
7355 Py_ssize_t i;
7356 int kind;
7357 const void *data;
7358
7359 outsize = PyUnicode_GET_LENGTH(rep); /* non-bytes replacement is read as a str below */
7360 morebytes += outsize;
7361 if (morebytes > 0) {
7362 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7363 newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
7364 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7365 Py_DECREF(rep);
7366 goto error;
7367 }
7368 out = PyBytes_AS_STRING(*outbytes) + offset;
7369 }
7370 kind = PyUnicode_KIND(rep);
7371 data = PyUnicode_DATA(rep);
7372 for (i=0; i < outsize; i++) {
7373 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7374 if (ch > 127) { /* a str replacement may only contain ASCII characters */
7375 raise_encode_exception(&exc,
7376 encoding, unicode,
7377 pos, pos + 1,
7378 "unable to encode error handler result to ASCII");
7379 Py_DECREF(rep);
7380 goto error;
7381 }
7382 *out = (unsigned char)ch;
7383 out++;
7384 }
7385 }
7386 pos = newpos; /* resume at the position chosen by the error handler */
7387 Py_DECREF(rep);
7388 }
7389 /* write a NUL byte */
7390 *out = 0;
7391 outsize = out - PyBytes_AS_STRING(*outbytes);
7392 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7393 if (_PyBytes_Resize(outbytes, outsize) < 0) /* trim the unused reserved space */
7394 goto error;
7395 ret = 0;
7396
7397 error: /* shared exit: also reached on success, with ret == 0 */
7398 Py_XDECREF(encoding_obj);
7399 Py_XDECREF(errorHandler);
7400 Py_XDECREF(exc);
7401 return ret;
7402 }
7402
7403 static PyObject *
7404 encode_code_page(int code_page,
7405 PyObject *unicode,
7406 const char *errors)
7407 {
7408 Py_ssize_t len;
7409 PyObject *outbytes = NULL;
7410 Py_ssize_t offset;
7411 int chunk_len, ret, done;
7412
7413 if (!PyUnicode_Check(unicode)) {
7414 PyErr_BadArgument();
7415 return NULL;
7416 }
7417
7418 len = PyUnicode_GET_LENGTH(unicode);
7419
7420 if (code_page < 0) {
7421 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7422 return NULL;
7423 }
7424
7425 if (len == 0)
7426 return PyBytes_FromStringAndSize(NULL, 0);
7427
7428 offset = 0;
7429 do
7430 {
7431 #ifdef NEED_RETRY
7432 if (len > DECODING_CHUNK_SIZE) {
7433 chunk_len = DECODING_CHUNK_SIZE;
7434 done = 0;
7435 }
7436 else
7437 #endif
7438 {
7439 chunk_len = (int)len;
7440 done = 1;
7441 }
7442
7443 ret = encode_code_page_strict(code_page, &outbytes,
7444 unicode, offset, chunk_len,
7445 errors);
7446 if (ret == -2)
7447 ret = encode_code_page_errors(code_page, &outbytes,
7448 unicode, offset,
7449 chunk_len, errors);
7450 if (ret < 0) {
7451 Py_XDECREF(outbytes);
7452 return NULL;
7453 }
7454
7455 offset += chunk_len;
7456 len -= chunk_len;
7457 } while (!done);
7458
7459 return outbytes;
7460 }
7461
7462 PyObject *
7463 PyUnicode_EncodeCodePage(int code_page,
7464 PyObject *unicode,
7465 const char *errors)
7466 {
7467 return encode_code_page(code_page, unicode, errors);
7468 }
7469
7470 PyObject *
7471 PyUnicode_AsMBCSString(PyObject *unicode)
7472 {
7473 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7474 }
7475
7476 #undef NEED_RETRY
7477
7478 #endif /* MS_WINDOWS */
7479
7480 /* --- Character Mapping Codec -------------------------------------------- */
7481
/* Decode the `size` bytes at `s` through a str `mapping` that is indexed by
   byte value, appending the result to `writer`.  A mapped value of 0xFFFE, or
   a byte beyond the end of the mapping, counts as undefined and is handed to
   the error handler named by `errors`.
   Returns 0 on success, -1 with an exception set on error. */
7482 static int
7483 39426 charmap_decode_string(const char *s,
7484 Py_ssize_t size,
7485 PyObject *mapping,
7486 const char *errors,
7487 _PyUnicodeWriter *writer)
7488 {
7489 39426 const char *starts = s;
7490 const char *e;
7491 Py_ssize_t startinpos, endinpos;
7492 39426 PyObject *errorHandler = NULL, *exc = NULL;
7493 Py_ssize_t maplen;
7494 int mapkind;
7495 const void *mapdata;
7496 Py_UCS4 x;
7497 unsigned char ch;
7498
7499 39426 maplen = PyUnicode_GET_LENGTH(mapping);
7500 39426 mapdata = PyUnicode_DATA(mapping);
7501 39426 mapkind = PyUnicode_KIND(mapping);
7502
7503 39426 e = s + size;
7504
7505
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 39426 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
39426 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7506 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7507 * is disabled in encoding aliases, latin1 is preferred because
7508 * its implementation is faster. */
7509 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
7510 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7511 Py_UCS4 maxchar = writer->maxchar;
7512
7513 assert (writer->kind == PyUnicode_1BYTE_KIND);
7514 while (s < e) {
7515 ch = *s;
7516 x = mapdata_ucs1[ch];
7517 if (x > maxchar) {
7518 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1) /* raise the writer's maxchar to 0xff */
7519 goto onError;
7520 maxchar = writer->maxchar;
7521 outdata = (Py_UCS1 *)writer->data; /* re-read the data pointer after Prepare */
7522 }
7523 outdata[writer->pos] = x;
7524 writer->pos++;
7525 ++s;
7526 }
7527 return 0; /* errorHandler and exc are still NULL on this path */
7528 }
7529
7530
1/2
✓ Branch 0 taken 39426 times.
✗ Branch 1 not taken.
39426 while (s < e) {
7531
2/4
✓ Branch 0 taken 39426 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 39426 times.
✗ Branch 3 not taken.
39426 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7532 39426 int outkind = writer->kind;
7533 39426 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
7534
1/2
✓ Branch 0 taken 39426 times.
✗ Branch 1 not taken.
39426 if (outkind == PyUnicode_1BYTE_KIND) {
7535 39426 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7536 39426 Py_UCS4 maxchar = writer->maxchar;
7537
2/2
✓ Branch 0 taken 1586232 times.
✓ Branch 1 taken 39426 times.
1625658 while (s < e) {
7538 1586232 ch = *s;
7539 1586232 x = mapdata_ucs2[ch];
7540
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1586232 times.
1586232 if (x > maxchar)
7541 goto Error; /* leave the tight loop; x is handled by the generic code at Error */
7542 1586232 outdata[writer->pos] = x;
7543 1586232 writer->pos++;
7544 1586232 ++s;
7545 }
7546 39426 break;
7547 }
7548 else if (outkind == PyUnicode_2BYTE_KIND) {
7549 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7550 while (s < e) {
7551 ch = *s;
7552 x = mapdata_ucs2[ch];
7553 if (x == 0xFFFE)
7554 goto Error; /* undefined mapping: handled at Error */
7555 outdata[writer->pos] = x;
7556 writer->pos++;
7557 ++s;
7558 }
7559 break;
7560 }
7561 }
7562 ch = *s;
7563
7564 if (ch < maplen)
7565 x = PyUnicode_READ(mapkind, mapdata, ch);
7566 else
7567 x = 0xfffe; /* invalid value */
7568 Error: /* also entered by goto from the tight loops above, with x already set */
7569 if (x == 0xfffe)
7570 {
7571 /* undefined mapping */
7572 startinpos = s-starts;
7573 endinpos = startinpos+1;
7574 if (unicode_decode_call_errorhandler_writer(
7575 errors, &errorHandler,
7576 "charmap", "character maps to <undefined>",
7577 &starts, &e, &startinpos, &endinpos, &exc, &s,
7578 writer)) {
7579 goto onError;
7580 }
7581 continue; /* s is passed by address to the handler call and is not advanced here */
7582 }
7583
7584 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7585 goto onError;
7586 ++s;
7587 }
7588 39426 Py_XDECREF(errorHandler);
7589 39426 Py_XDECREF(exc);
7590 39426 return 0;
7591
7592 onError:
7593 Py_XDECREF(errorHandler);
7594 Py_XDECREF(exc);
7595 return -1;
7596 }
7597
7598 static int
7599 charmap_decode_mapping(const char *s,
7600 Py_ssize_t size,
7601 PyObject *mapping,
7602 const char *errors,
7603 _PyUnicodeWriter *writer)
7604 {
7605 const char *starts = s;
7606 const char *e;
7607 Py_ssize_t startinpos, endinpos;
7608 PyObject *errorHandler = NULL, *exc = NULL;
7609 unsigned char ch;
7610 PyObject *key, *item = NULL;
7611
7612 e = s + size;
7613
7614 while (s < e) {
7615 ch = *s;
7616
7617 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7618 key = PyLong_FromLong((long)ch);
7619 if (key == NULL)
7620 goto onError;
7621
7622 item = PyObject_GetItem(mapping, key);
7623 Py_DECREF(key);
7624 if (item == NULL) {
7625 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7626 /* No mapping found means: mapping is undefined. */
7627 PyErr_Clear();
7628 goto Undefined;
7629 } else
7630 goto onError;
7631 }
7632
7633 /* Apply mapping */
7634 if (item == Py_None)
7635 goto Undefined;
7636 if (PyLong_Check(item)) {
7637 long value = PyLong_AS_LONG(item);
7638 if (value == 0xFFFE)
7639 goto Undefined;
7640 if (value < 0 || value > MAX_UNICODE) {
7641 PyErr_Format(PyExc_TypeError,
7642 "character mapping must be in range(0x%x)",
7643 (unsigned long)MAX_UNICODE + 1);
7644 goto onError;
7645 }
7646
7647 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7648 goto onError;
7649 }
7650 else if (PyUnicode_Check(item)) {
7651 if (PyUnicode_GET_LENGTH(item) == 1) {
7652 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7653 if (value == 0xFFFE)
7654 goto Undefined;
7655 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7656 goto onError;
7657 }
7658 else {
7659 writer->overallocate = 1;
7660 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7661 goto onError;
7662 }
7663 }
7664 else {
7665 /* wrong return value */
7666 PyErr_SetString(PyExc_TypeError,
7667 "character mapping must return integer, None or str");
7668 goto onError;
7669 }
7670 Py_CLEAR(item);
7671 ++s;
7672 continue;
7673
7674 Undefined:
7675 /* undefined mapping */
7676 Py_CLEAR(item);
7677 startinpos = s-starts;
7678 endinpos = startinpos+1;
7679 if (unicode_decode_call_errorhandler_writer(
7680 errors, &errorHandler,
7681 "charmap", "character maps to <undefined>",
7682 &starts, &e, &startinpos, &endinpos, &exc, &s,
7683 writer)) {
7684 goto onError;
7685 }
7686 }
7687 Py_XDECREF(errorHandler);
7688 Py_XDECREF(exc);
7689 return 0;
7690
7691 onError:
7692 Py_XDECREF(item);
7693 Py_XDECREF(errorHandler);
7694 Py_XDECREF(exc);
7695 return -1;
7696 }
7697
7698 PyObject *
7699 39426 PyUnicode_DecodeCharmap(const char *s,
7700 Py_ssize_t size,
7701 PyObject *mapping,
7702 const char *errors)
7703 {
7704 _PyUnicodeWriter writer;
7705
7706 /* Default to Latin-1 */
7707
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 39426 times.
39426 if (mapping == NULL)
7708 return PyUnicode_DecodeLatin1(s, size, errors);
7709
7710
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 39426 times.
39426 if (size == 0)
7711 _Py_RETURN_UNICODE_EMPTY();
7712 39426 _PyUnicodeWriter_Init(&writer);
7713 39426 writer.min_length = size;
7714
3/8
✗ Branch 0 not taken.
✓ Branch 1 taken 39426 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 39426 times.
✗ Branch 5 not taken.
✗ Branch 7 not taken.
✓ Branch 8 taken 39426 times.
39426 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
7715 goto onError;
7716
7717
1/2
✓ Branch 1 taken 39426 times.
✗ Branch 2 not taken.
39426 if (PyUnicode_CheckExact(mapping)) {
7718
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 39426 times.
39426 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7719 goto onError;
7720 }
7721 else {
7722 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7723 goto onError;
7724 }
7725 39426 return _PyUnicodeWriter_Finish(&writer);
7726
7727 onError:
7728 _PyUnicodeWriter_Dealloc(&writer);
7729 return NULL;
7730 }
7731
7732 /* Charmap encoding: the lookup table */
7733
7734 /*[clinic input]
7735 class EncodingMap "struct encoding_map *" "&EncodingMapType"
7736 [clinic start generated code]*/
7737 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
7738
/* Three-level lookup table from a code point (<= 0xFFFF) to a byte value,
   filled in by PyUnicode_BuildEncodingMap() and read by encoding_map_lookup(). */
7739 struct encoding_map {
7740 PyObject_HEAD
7741 unsigned char level1[32]; /* indexed by ch >> 11; 0xFF marks an unused entry */
7742 int count2, count3; /* number of 16-byte level-2 blocks and 128-byte level-3 blocks */
7743 unsigned char level23[1]; /* really 16*count2 level-2 bytes followed by 128*count3 level-3 bytes; the object is allocated larger than sizeof(struct encoding_map) */
7744 };
7745
7746 /*[clinic input]
7747 EncodingMap.size
7748
7749 Return the size (in bytes) of this object.
7750 [clinic start generated code]*/
7751
7752 static PyObject *
7753 EncodingMap_size_impl(struct encoding_map *self)
7754 /*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
7755 {
7756 return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
7757 128*self->count3);
7758 }
7759
/* Method table installed as EncodingMapType.tp_methods. */
7760 static PyMethodDef encoding_map_methods[] = {
7761 ENCODINGMAP_SIZE_METHODDEF /* presumably the Argument Clinic entry for EncodingMap.size -- macro is defined outside this view */
7762 {NULL, NULL} /* sentinel */
7763 };
7764
/* Type object for struct encoding_map instances; PyUnicode_BuildEncodingMap()
   initializes its trie results with this type. */
7765 static PyTypeObject EncodingMapType = {
7766 PyVarObject_HEAD_INIT(NULL, 0)
7767 .tp_name = "EncodingMap",
7768 .tp_basicsize = sizeof(struct encoding_map), /* fixed part only; instances carry extra level-2/level-3 table bytes */
7769 /* methods */
7770 .tp_flags = Py_TPFLAGS_DEFAULT,
7771 .tp_methods = encoding_map_methods,
7772 };
7773
/* Build an encoding table from a decoding table `string`, where string[i] is
   the character that byte value i decodes to (only the first 256 characters
   are used).
   Returns either a dict {code point: byte value} or, when the table is
   suitable, an EncodingMap object holding a three-level trie.
   Returns NULL with an exception set on error (non-str or empty argument,
   allocation failure). */
7774 PyObject*
7775 PyUnicode_BuildEncodingMap(PyObject* string)
7776 {
7777 PyObject *result;
7778 struct encoding_map *mresult;
7779 int i;
7780 int need_dict = 0;
7781 unsigned char level1[32]; /* indexed by ch >> 11 */
7782 unsigned char level2[512]; /* indexed by ch >> 7; only used to count level-3 blocks in the first pass */
7783 unsigned char *mlevel1, *mlevel2, *mlevel3;
7784 int count2 = 0, count3 = 0;
7785 int kind;
7786 const void *data;
7787 Py_ssize_t length;
7788 Py_UCS4 ch;
7789
7790 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
7791 PyErr_BadArgument();
7792 return NULL;
7793 }
7794 kind = PyUnicode_KIND(string);
7795 data = PyUnicode_DATA(string);
7796 length = PyUnicode_GET_LENGTH(string);
7797 length = Py_MIN(length, 256);
7798 memset(level1, 0xFF, sizeof level1); /* 0xFF = entry not assigned yet */
7799 memset(level2, 0xFF, sizeof level2);
7800
7801 /* If there isn't a one-to-one mapping of NULL to \0,
7802 or if there are non-BMP characters, we need to use
7803 a mapping dictionary. */
7804 if (PyUnicode_READ(kind, data, 0) != 0)
7805 need_dict = 1;
7806 for (i = 1; i < length; i++) { /* first pass: count the level-2 and level-3 blocks needed */
7807 int l1, l2;
7808 ch = PyUnicode_READ(kind, data, i);
7809 if (ch == 0 || ch > 0xFFFF) {
7810 need_dict = 1;
7811 break;
7812 }
7813 if (ch == 0xFFFE)
7814 /* unmapped character */
7815 continue;
7816 l1 = ch >> 11;
7817 l2 = ch >> 7;
7818 if (level1[l1] == 0xFF)
7819 level1[l1] = count2++;
7820 if (level2[l2] == 0xFF)
7821 level2[l2] = count3++;
7822 }
7823
7824 if (count2 >= 0xFF || count3 >= 0xFF) /* block numbers are stored in one byte and 0xFF is reserved as "unused" */
7825 need_dict = 1;
7826
7827 if (need_dict) {
7828 PyObject *result = PyDict_New(); /* shadows the outer `result` inside this block */
7829 PyObject *key, *value;
7830 if (!result)
7831 return NULL;
7832 for (i = 0; i < length; i++) {
7833 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7834 value = PyLong_FromLong(i);
7835 if (!key || !value)
7836 goto failed1;
7837 if (PyDict_SetItem(result, key, value) == -1)
7838 goto failed1;
7839 Py_DECREF(key);
7840 Py_DECREF(value);
7841 }
7842 return result;
7843 failed1:
7844 Py_XDECREF(key);
7845 Py_XDECREF(value);
7846 Py_DECREF(result);
7847 return NULL;
7848 }
7849
7850 /* Create a three-level trie */
7851 result = PyObject_Malloc(sizeof(struct encoding_map) +
7852 16*count2 + 128*count3 - 1); /* -1: level23[1] already provides one byte */
7853 if (!result) {
7854 return PyErr_NoMemory();
7855 }
7856
7857 _PyObject_Init(result, &EncodingMapType);
7858 mresult = (struct encoding_map*)result;
7859 mresult->count2 = count2;
7860 mresult->count3 = count3;
7861 mlevel1 = mresult->level1;
7862 mlevel2 = mresult->level23;
7863 mlevel3 = mresult->level23 + 16*count2; /* level-3 blocks start right after the level-2 blocks */
7864 memcpy(mlevel1, level1, 32);
7865 memset(mlevel2, 0xFF, 16*count2);
7866 memset(mlevel3, 0, 128*count3); /* 0 in level 3 = no mapping (see encoding_map_lookup) */
7867 count3 = 0; /* reused below to hand out level-3 block numbers */
7868 for (i = 1; i < length; i++) { /* second pass: fill in the trie */
7869 int o1, o2, o3, i2, i3;
7870 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7871 if (ch == 0xFFFE)
7872 /* unmapped character */
7873 continue;
7874 o1 = ch>>11;
7875 o2 = (ch>>7) & 0xF;
7876 i2 = 16*mlevel1[o1] + o2;
7877 if (mlevel2[i2] == 0xFF)
7878 mlevel2[i2] = count3++;
7879 o3 = ch & 0x7F;
7880 i3 = 128*mlevel2[i2] + o3;
7881 mlevel3[i3] = i; /* store the byte value that encodes ch */
7882 }
7883 return result;
7884 }
7885
7886 static int
7887 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7888 {
7889 struct encoding_map *map = (struct encoding_map*)mapping;
7890 int l1 = c>>11;
7891 int l2 = (c>>7) & 0xF;
7892 int l3 = c & 0x7F;
7893 int i;
7894
7895 if (c > 0xFFFF)
7896 return -1;
7897 if (c == 0)
7898 return 0;
7899 /* level 1*/
7900 i = map->level1[l1];
7901 if (i == 0xFF) {
7902 return -1;
7903 }
7904 /* level 2*/
7905 i = map->level23[16*i+l2];
7906 if (i == 0xFF) {
7907 return -1;
7908 }
7909 /* level 3 */
7910 i = map->level23[16*map->count2 + 128*i + l3];
7911 if (i == 0) {
7912 return -1;
7913 }
7914 return i;
7915 }
7916
7917 /* Lookup the character ch in the mapping. If the character
7918 can't be found, Py_None is returned (or NULL, if another
7919 error occurred). */
7920 static PyObject *
7921 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
7922 {
7923 PyObject *w = PyLong_FromLong((long)c);
7924 PyObject *x;
7925
7926 if (w == NULL)
7927 return NULL;
7928 x = PyObject_GetItem(mapping, w);
7929 Py_DECREF(w);
7930 if (x == NULL) {
7931 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7932 /* No mapping found means: mapping is undefined. */
7933 PyErr_Clear();
7934 Py_RETURN_NONE;
7935 } else
7936 return NULL;
7937 }
7938 else if (x == Py_None)
7939 return x;
7940 else if (PyLong_Check(x)) {
7941 long value = PyLong_AS_LONG(x);
7942 if (value < 0 || value > 255) {
7943 PyErr_SetString(PyExc_TypeError,
7944 "character mapping must be in range(256)");
7945 Py_DECREF(x);
7946 return NULL;
7947 }
7948 return x;
7949 }
7950 else if (PyBytes_Check(x))
7951 return x;
7952 else {
7953 /* wrong return value */
7954 PyErr_Format(PyExc_TypeError,
7955 "character mapping must return integer, bytes or None, not %.400s",
7956 Py_TYPE(x)->tp_name);
7957 Py_DECREF(x);
7958 return NULL;
7959 }
7960 }
7961
7962 static int
7963 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7964 {
7965 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7966 /* exponentially overallocate to minimize reallocations */
7967 if (requiredsize < 2*outsize)
7968 requiredsize = 2*outsize;
7969 if (_PyBytes_Resize(outobj, requiredsize))
7970 return -1;
7971 return 0;
7972 }
7973
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,     /* the character was written to the output */
    enc_FAILED,      /* the mapping is undefined for the character */
    enc_EXCEPTION    /* an error occurred (lookup or resize failed) */
} charmapencode_result;
7977 /* lookup the character, put the result in the output string and adjust
7978 various state variables. Resize the output bytes object if not enough
7979 space is available. Return a new reference to the object that
7980 was put in the output buffer, or Py_None, if the mapping was undefined
7981 (in which case no character was written) or NULL, if a
7982 reallocation error occurred. The caller must decref the result */
7983 static charmapencode_result
7984 charmapencode_output(Py_UCS4 c, PyObject *mapping,
7985 PyObject **outobj, Py_ssize_t *outpos)
7986 {
7987 PyObject *rep;
7988 char *outstart;
7989 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7990
7991 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
7992 int res = encoding_map_lookup(c, mapping);
7993 Py_ssize_t requiredsize = *outpos+1;
7994 if (res == -1)
7995 return enc_FAILED;
7996 if (outsize<requiredsize)
7997 if (charmapencode_resize(outobj, outpos, requiredsize))
7998 return enc_EXCEPTION;
7999 outstart = PyBytes_AS_STRING(*outobj);
8000 outstart[(*outpos)++] = (char)res;
8001 return enc_SUCCESS;
8002 }
8003
8004 rep = charmapencode_lookup(c, mapping);
8005 if (rep==NULL)
8006 return enc_EXCEPTION;
8007 else if (rep==Py_None) {
8008 Py_DECREF(rep);
8009 return enc_FAILED;
8010 } else {
8011 if (PyLong_Check(rep)) {
8012 Py_ssize_t requiredsize = *outpos+1;
8013 if (outsize<requiredsize)
8014 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8015 Py_DECREF(rep);
8016 return enc_EXCEPTION;
8017 }
8018 outstart = PyBytes_AS_STRING(*outobj);
8019 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8020 }
8021 else {
8022 const char *repchars = PyBytes_AS_STRING(rep);
8023 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8024 Py_ssize_t requiredsize = *outpos+repsize;
8025 if (outsize<requiredsize)
8026 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8027 Py_DECREF(rep);
8028 return enc_EXCEPTION;
8029 }
8030 outstart = PyBytes_AS_STRING(*outobj);
8031 memcpy(outstart + *outpos, repchars, repsize);
8032 *outpos += repsize;
8033 }
8034 }
8035 Py_DECREF(rep);
8036 return enc_SUCCESS;
8037 }
8038
8039 /* handle an error in PyUnicode_EncodeCharmap
8040 Return 0 on success, -1 on error */
8041 static int
8042 charmap_encoding_error(
8043 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8044 PyObject **exceptionObject,
8045 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8046 PyObject **res, Py_ssize_t *respos)
8047 {
8048 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8049 Py_ssize_t size, repsize;
8050 Py_ssize_t newpos;
8051 int kind;
8052 const void *data;
8053 Py_ssize_t index;
8054 /* startpos for collecting unencodable chars */
8055 Py_ssize_t collstartpos = *inpos;
8056 Py_ssize_t collendpos = *inpos+1;
8057 Py_ssize_t collpos;
8058 const char *encoding = "charmap";
8059 const char *reason = "character maps to <undefined>";
8060 charmapencode_result x;
8061 Py_UCS4 ch;
8062 int val;
8063
8064 size = PyUnicode_GET_LENGTH(unicode);
8065 /* find all unencodable characters */
8066 while (collendpos < size) {
8067 PyObject *rep;
8068 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8069 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8070 val = encoding_map_lookup(ch, mapping);
8071 if (val != -1)
8072 break;
8073 ++collendpos;
8074 continue;
8075 }
8076
8077 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8078 rep = charmapencode_lookup(ch, mapping);
8079 if (rep==NULL)
8080 return -1;
8081 else if (rep!=Py_None) {
8082 Py_DECREF(rep);
8083 break;
8084 }
8085 Py_DECREF(rep);
8086 ++collendpos;
8087 }
8088 /* cache callback name lookup
8089 * (if not done yet, i.e. it's the first error) */
8090 if (*error_handler == _Py_ERROR_UNKNOWN)
8091 *error_handler = _Py_GetErrorHandler(errors);
8092
8093 switch (*error_handler) {
8094 case _Py_ERROR_STRICT:
8095 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8096 return -1;
8097
8098 case _Py_ERROR_REPLACE:
8099 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8100 x = charmapencode_output('?', mapping, res, respos);
8101 if (x==enc_EXCEPTION) {
8102 return -1;
8103 }
8104 else if (x==enc_FAILED) {
8105 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8106 return -1;
8107 }
8108 }
8109 /* fall through */
8110 case _Py_ERROR_IGNORE:
8111 *inpos = collendpos;
8112 break;
8113
8114 case _Py_ERROR_XMLCHARREFREPLACE:
8115 /* generate replacement (temporarily (mis)uses p) */
8116 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8117 char buffer[2+29+1+1];
8118 char *cp;
8119 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8120 for (cp = buffer; *cp; ++cp) {
8121 x = charmapencode_output(*cp, mapping, res, respos);
8122 if (x==enc_EXCEPTION)
8123 return -1;
8124 else if (x==enc_FAILED) {
8125 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8126 return -1;
8127 }
8128 }
8129 }
8130 *inpos = collendpos;
8131 break;
8132
8133 default:
8134 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8135 encoding, reason, unicode, exceptionObject,
8136 collstartpos, collendpos, &newpos);
8137 if (repunicode == NULL)
8138 return -1;
8139 if (PyBytes_Check(repunicode)) {
8140 /* Directly copy bytes result to output. */
8141 Py_ssize_t outsize = PyBytes_Size(*res);
8142 Py_ssize_t requiredsize;
8143 repsize = PyBytes_Size(repunicode);
8144 requiredsize = *respos + repsize;
8145 if (requiredsize > outsize)
8146 /* Make room for all additional bytes. */
8147 if (charmapencode_resize(res, respos, requiredsize)) {
8148 Py_DECREF(repunicode);
8149 return -1;
8150 }
8151 memcpy(PyBytes_AsString(*res) + *respos,
8152 PyBytes_AsString(repunicode), repsize);
8153 *respos += repsize;
8154 *inpos = newpos;
8155 Py_DECREF(repunicode);
8156 break;
8157 }
8158 /* generate replacement */
8159 repsize = PyUnicode_GET_LENGTH(repunicode);
8160 data = PyUnicode_DATA(repunicode);
8161 kind = PyUnicode_KIND(repunicode);
8162 for (index = 0; index < repsize; index++) {
8163 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8164 x = charmapencode_output(repch, mapping, res, respos);
8165 if (x==enc_EXCEPTION) {
8166 Py_DECREF(repunicode);
8167 return -1;
8168 }
8169 else if (x==enc_FAILED) {
8170 Py_DECREF(repunicode);
8171 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8172 return -1;
8173 }
8174 }
8175 *inpos = newpos;
8176 Py_DECREF(repunicode);
8177 }
8178 return 0;
8179 }
8180
8181 PyObject *
8182 _PyUnicode_EncodeCharmap(PyObject *unicode,
8183 PyObject *mapping,
8184 const char *errors)
8185 {
8186 /* output object */
8187 PyObject *res = NULL;
8188 /* current input position */
8189 Py_ssize_t inpos = 0;
8190 Py_ssize_t size;
8191 /* current output position */
8192 Py_ssize_t respos = 0;
8193 PyObject *error_handler_obj = NULL;
8194 PyObject *exc = NULL;
8195 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8196 const void *data;
8197 int kind;
8198
8199 size = PyUnicode_GET_LENGTH(unicode);
8200 data = PyUnicode_DATA(unicode);
8201 kind = PyUnicode_KIND(unicode);
8202
8203 /* Default to Latin-1 */
8204 if (mapping == NULL)
8205 return unicode_encode_ucs1(unicode, errors, 256);
8206
8207 /* allocate enough for a simple encoding without
8208 replacements, if we need more, we'll resize */
8209 res = PyBytes_FromStringAndSize(NULL, size);
8210 if (res == NULL)
8211 goto onError;
8212 if (size == 0)
8213 return res;
8214
8215 while (inpos<size) {
8216 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8217 /* try to encode it */
8218 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8219 if (x==enc_EXCEPTION) /* error */
8220 goto onError;
8221 if (x==enc_FAILED) { /* unencodable character */
8222 if (charmap_encoding_error(unicode, &inpos, mapping,
8223 &exc,
8224 &error_handler, &error_handler_obj, errors,
8225 &res, &respos)) {
8226 goto onError;
8227 }
8228 }
8229 else
8230 /* done with this character => adjust input position */
8231 ++inpos;
8232 }
8233
8234 /* Resize if we allocated to much */
8235 if (respos<PyBytes_GET_SIZE(res))
8236 if (_PyBytes_Resize(&res, respos) < 0)
8237 goto onError;
8238
8239 Py_XDECREF(exc);
8240 Py_XDECREF(error_handler_obj);
8241 return res;
8242
8243 onError:
8244 Py_XDECREF(res);
8245 Py_XDECREF(exc);
8246 Py_XDECREF(error_handler_obj);
8247 return NULL;
8248 }
8249
8250 PyObject *
8251 PyUnicode_AsCharmapString(PyObject *unicode,
8252 PyObject *mapping)
8253 {
8254 if (!PyUnicode_Check(unicode) || mapping == NULL) {
8255 PyErr_BadArgument();
8256 return NULL;
8257 }
8258 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8259 }
8260
8261 /* create or adjust a UnicodeTranslateError */
8262 static void
8263 make_translate_exception(PyObject **exceptionObject,
8264 PyObject *unicode,
8265 Py_ssize_t startpos, Py_ssize_t endpos,
8266 const char *reason)
8267 {
8268 if (*exceptionObject == NULL) {
8269 *exceptionObject = _PyUnicodeTranslateError_Create(
8270 unicode, startpos, endpos, reason);
8271 }
8272 else {
8273 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8274 goto onError;
8275 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8276 goto onError;
8277 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8278 goto onError;
8279 return;
8280 onError:
8281 Py_CLEAR(*exceptionObject);
8282 }
8283 }
8284
8285 /* error handling callback helper:
8286 build arguments, call the callback and check the arguments,
8287 put the result into newpos and return the replacement string, which
8288 has to be freed by the caller */
8289 static PyObject *
8290 unicode_translate_call_errorhandler(const char *errors,
8291 PyObject **errorHandler,
8292 const char *reason,
8293 PyObject *unicode, PyObject **exceptionObject,
8294 Py_ssize_t startpos, Py_ssize_t endpos,
8295 Py_ssize_t *newpos)
8296 {
8297 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8298
8299 Py_ssize_t i_newpos;
8300 PyObject *restuple;
8301 PyObject *resunicode;
8302
8303 if (*errorHandler == NULL) {
8304 *errorHandler = PyCodec_LookupError(errors);
8305 if (*errorHandler == NULL)
8306 return NULL;
8307 }
8308
8309 make_translate_exception(exceptionObject,
8310 unicode, startpos, endpos, reason);
8311 if (*exceptionObject == NULL)
8312 return NULL;
8313
8314 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
8315 if (restuple == NULL)
8316 return NULL;
8317 if (!PyTuple_Check(restuple)) {
8318 PyErr_SetString(PyExc_TypeError, &argparse[3]);
8319 Py_DECREF(restuple);
8320 return NULL;
8321 }
8322 if (!PyArg_ParseTuple(restuple, argparse,
8323 &resunicode, &i_newpos)) {
8324 Py_DECREF(restuple);
8325 return NULL;
8326 }
8327 if (i_newpos<0)
8328 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8329 else
8330 *newpos = i_newpos;
8331 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8332 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8333 Py_DECREF(restuple);
8334 return NULL;
8335 }
8336 Py_INCREF(resunicode);
8337 Py_DECREF(restuple);
8338 return resunicode;
8339 }
8340
8341 /* Lookup the character ch in the mapping and put the result in result,
8342 which must be decrefed by the caller.
8343 Return 0 on success, -1 on error */
8344 static int
8345 2089513 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8346 {
8347 2089513 PyObject *w = PyLong_FromLong((long)c);
8348 PyObject *x;
8349
8350
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2089513 times.
2089513 if (w == NULL)
8351 return -1;
8352 2089513 x = PyObject_GetItem(mapping, w);
8353 2089513 Py_DECREF(w);
8354
2/2
✓ Branch 0 taken 1965463 times.
✓ Branch 1 taken 124050 times.
2089513 if (x == NULL) {
8355
1/2
✓ Branch 1 taken 1965463 times.
✗ Branch 2 not taken.
1965463 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8356 /* No mapping found means: use 1:1 mapping. */
8357 1965463 PyErr_Clear();
8358 1965463 *result = NULL;
8359 1965463 return 0;
8360 } else
8361 return -1;
8362 }
8363
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 124050 times.
124050 else if (x == Py_None) {
8364 *result = x;
8365 return 0;
8366 }
8367
2/2
✓ Branch 2 taken 1579 times.
✓ Branch 3 taken 122471 times.
124050 else if (PyLong_Check(x)) {
8368 1579 long value = PyLong_AS_LONG(x);
8369
2/4
✓ Branch 0 taken 1579 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1579 times.
1579 if (value < 0 || value > MAX_UNICODE) {
8370 PyErr_Format(PyExc_ValueError,
8371 "character mapping must be in range(0x%x)",
8372 MAX_UNICODE+1);
8373 Py_DECREF(x);
8374 return -1;
8375 }
8376 1579 *result = x;
8377 1579 return 0;
8378 }
8379
1/2
✓ Branch 2 taken 122471 times.
✗ Branch 3 not taken.
122471 else if (PyUnicode_Check(x)) {
8380 122471 *result = x;
8381 122471 return 0;
8382 }
8383 else {
8384 /* wrong return value */
8385 PyErr_SetString(PyExc_TypeError,
8386 "character mapping must return integer, None or str");
8387 Py_DECREF(x);
8388 return -1;
8389 }
8390 }
8391
8392 /* lookup the character, write the result into the writer.
8393 Return 1 if the result was written into the writer, return 0 if the mapping
8394 was undefined, raise an exception return -1 on error. */
8395 static int
8396 89897 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8397 _PyUnicodeWriter *writer)
8398 {
8399 PyObject *item;
8400
8401
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 89897 times.
89897 if (charmaptranslate_lookup(ch, mapping, &item))
8402 return -1;
8403
8404
2/2
✓ Branch 0 taken 18062 times.
✓ Branch 1 taken 71835 times.
89897 if (item == NULL) {
8405 /* not found => default to 1:1 mapping */
8406
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 18062 times.
18062 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8407 return -1;
8408 }
8409 18062 return 1;
8410 }
8411
8412
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 71835 times.
71835 if (item == Py_None) {
8413 Py_DECREF(item);
8414 return 0;
8415 }
8416
8417
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 71835 times.
71835 if (PyLong_Check(item)) {
8418 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8419 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8420 used it */
8421 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8422 Py_DECREF(item);
8423 return -1;
8424 }
8425 Py_DECREF(item);
8426 return 1;
8427 }
8428
8429
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 71835 times.
71835 if (!PyUnicode_Check(item)) {
8430 Py_DECREF(item);
8431 return -1;
8432 }
8433
8434
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 71835 times.
71835 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8435 Py_DECREF(item);
8436 return -1;
8437 }
8438
8439 71835 Py_DECREF(item);
8440 71835 return 1;
8441 }
8442
8443 static int
8444 1999616 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8445 Py_UCS1 *translate)
8446 {
8447 1999616 PyObject *item = NULL;
8448 1999616 int ret = 0;
8449
8450
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 1999616 times.
1999616 if (charmaptranslate_lookup(ch, mapping, &item)) {
8451 return -1;
8452 }
8453
8454
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1999616 times.
1999616 if (item == Py_None) {
8455 /* deletion */
8456 translate[ch] = 0xfe;
8457 }
8458
2/2
✓ Branch 0 taken 1947401 times.
✓ Branch 1 taken 52215 times.
1999616 else if (item == NULL) {
8459 /* not found => default to 1:1 mapping */
8460 1947401 translate[ch] = ch;
8461 1947401 return 1;
8462 }
8463
2/2
✓ Branch 2 taken 1579 times.
✓ Branch 3 taken 50636 times.
52215 else if (PyLong_Check(item)) {
8464 1579 long replace = PyLong_AS_LONG(item);
8465 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8466 used it */
8467
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1579 times.
1579 if (127 < replace) {
8468 /* invalid character or character outside ASCII:
8469 skip the fast translate */
8470 goto exit;
8471 }
8472 1579 translate[ch] = (Py_UCS1)replace;
8473 }
8474
1/2
✓ Branch 2 taken 50636 times.
✗ Branch 3 not taken.
50636 else if (PyUnicode_Check(item)) {
8475 Py_UCS4 replace;
8476
8477
1/2
✓ Branch 1 taken 50636 times.
✗ Branch 2 not taken.
50636 if (PyUnicode_GET_LENGTH(item) != 1)
8478 50636 goto exit;
8479
8480 replace = PyUnicode_READ_CHAR(item, 0);
8481 if (replace > 127)
8482 goto exit;
8483 translate[ch] = (Py_UCS1)replace;
8484 }
8485 else {
8486 /* not None, NULL, long or unicode */
8487 goto exit;
8488 }
8489 1579 ret = 1;
8490
8491 52215 exit:
8492 52215 Py_DECREF(item);
8493 52215 return ret;
8494 }
8495
8496 /* Fast path for ascii => ascii translation. Return 1 if the whole string
8497 was translated into writer, return 0 if the input string was partially
8498 translated into writer, raise an exception and return -1 on error. */
8499 static int
8500 720037 unicode_fast_translate(PyObject *input, PyObject *mapping,
8501 _PyUnicodeWriter *writer, int ignore,
8502 Py_ssize_t *input_pos)
8503 {
8504 Py_UCS1 ascii_table[128], ch, ch2;
8505 Py_ssize_t len;
8506 const Py_UCS1 *in, *end;
8507 Py_UCS1 *out;
8508 720037 int res = 0;
8509
8510 720037 len = PyUnicode_GET_LENGTH(input);
8511
8512 720037 memset(ascii_table, 0xff, 128);
8513
8514 720037 in = PyUnicode_1BYTE_DATA(input);
8515 720037 end = in + len;
8516
8517 assert(PyUnicode_IS_ASCII(writer->buffer));
8518 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8519 720037 out = PyUnicode_1BYTE_DATA(writer->buffer);
8520
8521
2/2
✓ Branch 0 taken 2230871 times.
✓ Branch 1 taken 669401 times.
2900272 for (; in < end; in++) {
8522 2230871 ch = *in;
8523 2230871 ch2 = ascii_table[ch];
8524
2/2
✓ Branch 0 taken 1999616 times.
✓ Branch 1 taken 231255 times.
2230871 if (ch2 == 0xff) {
8525 1999616 int translate = unicode_fast_translate_lookup(mapping, ch,
8526 ascii_table);
8527
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1999616 times.
1999616 if (translate < 0)
8528 return -1;
8529
2/2
✓ Branch 0 taken 50636 times.
✓ Branch 1 taken 1948980 times.
1999616 if (translate == 0)
8530 50636 goto exit;
8531 1948980 ch2 = ascii_table[ch];
8532 }
8533
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2180235 times.
2180235 if (ch2 == 0xfe) {
8534 if (ignore)
8535 continue;
8536 goto exit;
8537 }
8538 assert(ch2 < 128);
8539 2180235 *out = ch2;
8540 2180235 out++;
8541 }
8542 669401 res = 1;
8543
8544 720037 exit:
8545 720037 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8546 720037 *input_pos = in - PyUnicode_1BYTE_DATA(input);
8547 720037 return res;
8548 }
8549
8550 static PyObject *
8551 720739 _PyUnicode_TranslateCharmap(PyObject *input,
8552 PyObject *mapping,
8553 const char *errors)
8554 {
8555 /* input object */
8556 const void *data;
8557 Py_ssize_t size, i;
8558 int kind;
8559 /* output buffer */
8560 _PyUnicodeWriter writer;
8561 /* error handler */
8562 720739 const char *reason = "character maps to <undefined>";
8563 720739 PyObject *errorHandler = NULL;
8564 720739 PyObject *exc = NULL;
8565 int ignore;
8566 int res;
8567
8568
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 720739 times.
720739 if (mapping == NULL) {
8569 PyErr_BadArgument();
8570 return NULL;
8571 }
8572
8573 720739 data = PyUnicode_DATA(input);
8574 720739 kind = PyUnicode_KIND(input);
8575 720739 size = PyUnicode_GET_LENGTH(input);
8576
8577
2/2
✓ Branch 0 taken 702 times.
✓ Branch 1 taken 720037 times.
720739 if (size == 0)
8578 702 return PyUnicode_FromObject(input);
8579
8580 /* allocate enough for a simple 1:1 translation without
8581 replacements, if we need more, we'll resize */
8582 720037 _PyUnicodeWriter_Init(&writer);
8583
3/8
✗ Branch 0 not taken.
✓ Branch 1 taken 720037 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 720037 times.
✗ Branch 5 not taken.
✗ Branch 7 not taken.
✓ Branch 8 taken 720037 times.
720037 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8584 goto onError;
8585
8586
2/4
✓ Branch 0 taken 720037 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 720037 times.
✗ Branch 3 not taken.
720037 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8587
8588
1/2
✓ Branch 1 taken 720037 times.
✗ Branch 2 not taken.
720037 if (PyUnicode_IS_ASCII(input)) {
8589 720037 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8590
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 720037 times.
720037 if (res < 0) {
8591 _PyUnicodeWriter_Dealloc(&writer);
8592 return NULL;
8593 }
8594
2/2
✓ Branch 0 taken 669401 times.
✓ Branch 1 taken 50636 times.
720037 if (res == 1)
8595 669401 return _PyUnicodeWriter_Finish(&writer);
8596 }
8597 else {
8598 i = 0;
8599 }
8600
8601
2/2
✓ Branch 0 taken 89897 times.
✓ Branch 1 taken 50636 times.
140533 while (i<size) {
8602 /* try to encode it */
8603 int translate;
8604 89897 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8605 Py_ssize_t newpos;
8606 /* startpos for collecting untranslatable chars */
8607 Py_ssize_t collstart;
8608 Py_ssize_t collend;
8609 Py_UCS4 ch;
8610
8611 89897 ch = PyUnicode_READ(kind, data, i);
8612 89897 translate = charmaptranslate_output(ch, mapping, &writer);
8613
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 89897 times.
89897 if (translate < 0)
8614 goto onError;
8615
8616
1/2
✓ Branch 0 taken 89897 times.
✗ Branch 1 not taken.
89897 if (translate != 0) {
8617 /* it worked => adjust input pointer */
8618 89897 ++i;
8619 89897 continue;
8620 }
8621
8622 /* untranslatable character */
8623 collstart = i;
8624 collend = i+1;
8625
8626 /* find all untranslatable characters */
8627 while (collend < size) {
8628 PyObject *x;
8629 ch = PyUnicode_READ(kind, data, collend);
8630 if (charmaptranslate_lookup(ch, mapping, &x))
8631 goto onError;
8632 Py_XDECREF(x);
8633 if (x != Py_None)
8634 break;
8635 ++collend;
8636 }
8637
8638 if (ignore) {
8639 i = collend;
8640 }
8641 else {
8642 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8643 reason, input, &exc,
8644 collstart, collend, &newpos);
8645 if (repunicode == NULL)
8646 goto onError;
8647 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
8648 Py_DECREF(repunicode);
8649 goto onError;
8650 }
8651 Py_DECREF(repunicode);
8652 i = newpos;
8653 }
8654 }
8655 50636 Py_XDECREF(exc);
8656 50636 Py_XDECREF(errorHandler);
8657 50636 return _PyUnicodeWriter_Finish(&writer);
8658
8659 onError:
8660 _PyUnicodeWriter_Dealloc(&writer);
8661 Py_XDECREF(exc);
8662 Py_XDECREF(errorHandler);
8663 return NULL;
8664 }
8665
8666 PyObject *
8667 PyUnicode_Translate(PyObject *str,
8668 PyObject *mapping,
8669 const char *errors)
8670 {
8671 if (ensure_unicode(str) < 0)
8672 return NULL;
8673 return _PyUnicode_TranslateCharmap(str, mapping, errors);
8674 }
8675
8676 PyObject *
8677 710767 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8678 {
8679
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 710767 times.
710767 if (!PyUnicode_Check(unicode)) {
8680 PyErr_BadInternalCall();
8681 return NULL;
8682 }
8683
1/2
✓ Branch 1 taken 710767 times.
✗ Branch 2 not taken.
710767 if (PyUnicode_IS_ASCII(unicode)) {
8684 /* If the string is already ASCII, just return the same string */
8685 710767 Py_INCREF(unicode);
8686 710767 return unicode;
8687 }
8688
8689 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
8690 PyObject *result = PyUnicode_New(len, 127);
8691 if (result == NULL) {
8692 return NULL;
8693 }
8694
8695 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
8696 int kind = PyUnicode_KIND(unicode);
8697 const void *data = PyUnicode_DATA(unicode);
8698 Py_ssize_t i;
8699 for (i = 0; i < len; ++i) {
8700 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8701 if (ch < 127) {
8702 out[i] = ch;
8703 }
8704 else if (Py_UNICODE_ISSPACE(ch)) {
8705 out[i] = ' ';
8706 }
8707 else {
8708 int decimal = Py_UNICODE_TODECIMAL(ch);
8709 if (decimal < 0) {
8710 out[i] = '?';
8711 out[i+1] = '\0';
8712 _PyUnicode_LENGTH(result) = i + 1;
8713 break;
8714 }
8715 out[i] = '0' + decimal;
8716 }
8717 }
8718
8719 assert(_PyUnicode_CheckConsistency(result, 1));
8720 return result;
8721 }
8722
8723 /* --- Helpers ------------------------------------------------------------ */
8724
/* helper macro to fixup start/end slice values:
   - end is clipped to len; a negative end is taken relative to len and
     then clipped to 0;
   - a negative start is taken relative to len and then clipped to 0.
   start and end must be modifiable lvalues; they are updated in place.
   The body is wrapped in do { } while (0) so that the macro expands to
   a single statement and can be used safely in an unbraced if/else.
   Call it as a statement, followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
8739
8740 static Py_ssize_t
8741 2528627 any_find_slice(PyObject* s1, PyObject* s2,
8742 Py_ssize_t start,
8743 Py_ssize_t end,
8744 int direction)
8745 {
8746 int kind1, kind2;
8747 const void *buf1, *buf2;
8748 Py_ssize_t len1, len2, result;
8749
8750 2528627 kind1 = PyUnicode_KIND(s1);
8751 2528627 kind2 = PyUnicode_KIND(s2);
8752
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2528627 times.
2528627 if (kind1 < kind2)
8753 return -1;
8754
8755 2528627 len1 = PyUnicode_GET_LENGTH(s1);
8756 2528627 len2 = PyUnicode_GET_LENGTH(s2);
8757
4/10
✓ Branch 0 taken 2470821 times.
✓ Branch 1 taken 57806 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 57806 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2528627 times.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
2528627 ADJUST_INDICES(start, end, len1);
8758
2/2
✓ Branch 0 taken 1358 times.
✓ Branch 1 taken 2527269 times.
2528627 if (end - start < len2)
8759 1358 return -1;
8760
8761 2527269 buf1 = PyUnicode_DATA(s1);
8762 2527269 buf2 = PyUnicode_DATA(s2);
8763
2/2
✓ Branch 0 taken 2306856 times.
✓ Branch 1 taken 220413 times.
2527269 if (len2 == 1) {
8764 2306856 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
8765 2306856 result = findchar((const char *)buf1 + kind1*start,
8766 kind1, end - start, ch, direction);
8767
2/2
✓ Branch 0 taken 394605 times.
✓ Branch 1 taken 1912251 times.
2306856 if (result == -1)
8768 394605 return -1;
8769 else
8770 1912251 return start + result;
8771 }
8772
8773
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 220413 times.
220413 if (kind2 != kind1) {
8774 buf2 = unicode_askind(kind2, buf2, len2, kind1);
8775 if (!buf2)
8776 return -2;
8777 }
8778
8779
1/2
✓ Branch 0 taken 220413 times.
✗ Branch 1 not taken.
220413 if (direction > 0) {
8780
1/4
✓ Branch 0 taken 220413 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
220413 switch (kind1) {
8781 220413 case PyUnicode_1BYTE_KIND:
8782
2/4
✓ Branch 1 taken 220413 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 220413 times.
✗ Branch 5 not taken.
220413 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8783 220413 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8784 else
8785 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8786 220413 break;
8787 case PyUnicode_2BYTE_KIND:
8788 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8789 break;
8790 case PyUnicode_4BYTE_KIND:
8791 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8792 break;
8793 default:
8794 Py_UNREACHABLE();
8795 }
8796 }
8797 else {
8798 switch (kind1) {
8799 case PyUnicode_1BYTE_KIND:
8800 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8801 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8802 else
8803 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8804 break;
8805 case PyUnicode_2BYTE_KIND:
8806 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8807 break;
8808 case PyUnicode_4BYTE_KIND:
8809 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8810 break;
8811 default:
8812 Py_UNREACHABLE();
8813 }
8814 }
8815
8816 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
8817
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 220413 times.
220413 if (kind2 != kind1)
8818 PyMem_Free((void *)buf2);
8819
8820 220413 return result;
8821 }
8822
8823 /* _PyUnicode_InsertThousandsGrouping() helper functions */
8824 #include "stringlib/localeutil.h"
8825
8826 /**
8827 * InsertThousandsGrouping:
8828 * @writer: Unicode writer.
8829 * @n_buffer: Number of characters in @buffer.
8830 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
8831 * @d_pos: Start of digits string.
8832 * @n_digits: The number of digits in the string, in which we want
8833 * to put the grouping chars.
8834 * @min_width: The minimum width of the digits in the output string.
8835 * Output will be zero-padded on the left to fill.
8836 * @grouping: see definition in localeconv().
8837 * @thousands_sep: see definition in localeconv().
8838 *
8839 * There are 2 modes: counting and filling. If @writer is NULL,
8840 * we are in counting mode, else filling mode.
8841 * If counting, the required buffer size is returned.
8842 * If filling, we know the buffer will be large enough, so we don't
8843 * need to pass in the buffer size.
8844 * Inserts thousand grouping characters (as defined by grouping and
8845 * thousands_sep) into @writer.
8846 *
8847 * Return value: -1 on error, number of characters otherwise.
8848 **/
8849 Py_ssize_t
8850 739156 _PyUnicode_InsertThousandsGrouping(
8851 _PyUnicodeWriter *writer,
8852 Py_ssize_t n_buffer,
8853 PyObject *digits,
8854 Py_ssize_t d_pos,
8855 Py_ssize_t n_digits,
8856 Py_ssize_t min_width,
8857 const char *grouping,
8858 PyObject *thousands_sep,
8859 Py_UCS4 *maxchar)
8860 {
8861 739156 min_width = Py_MAX(0, min_width);
8862 if (writer) {
8863 assert(digits != NULL);
8864 assert(maxchar == NULL);
8865 }
8866 else {
8867 assert(digits == NULL);
8868 assert(maxchar != NULL);
8869 }
8870 assert(0 <= d_pos);
8871 assert(0 <= n_digits);
8872 assert(grouping != NULL);
8873
8874 739156 Py_ssize_t count = 0;
8875 Py_ssize_t n_zeros;
8876 739156 int loop_broken = 0;
8877 739156 int use_separator = 0; /* First time through, don't append the
8878 separator. They only go between
8879 groups. */
8880 Py_ssize_t buffer_pos;
8881 Py_ssize_t digits_pos;
8882 Py_ssize_t len;
8883 Py_ssize_t n_chars;
8884 739156 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
8885 be looked at */
8886 /* A generator that returns all of the grouping widths, until it
8887 returns 0. */
8888 GroupGenerator groupgen;
8889 739156 GroupGenerator_init(&groupgen, grouping);
8890 739156 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8891
8892 /* if digits are not grouped, thousands separator
8893 should be an empty string */
8894 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
8895
8896 739156 digits_pos = d_pos + n_digits;
8897
2/2
✓ Branch 0 taken 369578 times.
✓ Branch 1 taken 369578 times.
739156 if (writer) {
8898 369578 buffer_pos = writer->pos + n_buffer;
8899 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
8900 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
8901 }
8902 else {
8903 369578 buffer_pos = n_buffer;
8904 }
8905
8906
2/2
✓ Branch 0 taken 369578 times.
✓ Branch 1 taken 369578 times.
739156 if (!writer) {
8907 369578 *maxchar = 127;
8908 }
8909
8910
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 739156 times.
739156 while ((len = GroupGenerator_next(&groupgen)) > 0) {
8911 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
8912 n_zeros = Py_MAX(0, len - remaining);
8913 n_chars = Py_MAX(0, Py_MIN(remaining, len));
8914
8915 /* Use n_zero zero's and n_chars chars */
8916
8917 /* Count only, don't do anything. */
8918 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
8919
8920 /* Copy into the writer. */
8921 InsertThousandsGrouping_fill(writer, &buffer_pos,
8922 digits, &digits_pos,
8923 n_chars, n_zeros,
8924 use_separator ? thousands_sep : NULL,
8925 thousands_sep_len, maxchar);
8926
8927 /* Use a separator next time. */
8928 use_separator = 1;
8929
8930 remaining -= n_chars;
8931 min_width -= len;
8932
8933 if (remaining <= 0 && min_width <= 0) {
8934 loop_broken = 1;
8935 break;
8936 }
8937 min_width -= thousands_sep_len;
8938 }
8939
1/2
✓ Branch 0 taken 739156 times.
✗ Branch 1 not taken.
739156 if (!loop_broken) {
8940 /* We left the loop without using a break statement. */
8941
8942 739156 len = Py_MAX(Py_MAX(remaining, min_width), 1);
8943 739156 n_zeros = Py_MAX(0, len - remaining);
8944 739156 n_chars = Py_MAX(0, Py_MIN(remaining, len));
8945
8946 /* Use n_zero zero's and n_chars chars */
8947
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 739156 times.
739156 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
8948
8949 /* Copy into the writer. */
8950
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 739156 times.
739156 InsertThousandsGrouping_fill(writer, &buffer_pos,
8951 digits, &digits_pos,
8952 n_chars, n_zeros,
8953 use_separator ? thousands_sep : NULL,
8954 thousands_sep_len, maxchar);
8955 }
8956 739156 return count;
8957 }
8958
8959
8960 Py_ssize_t
8961 PyUnicode_Count(PyObject *str,
8962 PyObject *substr,
8963 Py_ssize_t start,
8964 Py_ssize_t end)
8965 {
8966 Py_ssize_t result;
8967 int kind1, kind2;
8968 const void *buf1 = NULL, *buf2 = NULL;
8969 Py_ssize_t len1, len2;
8970
8971 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
8972 return -1;
8973
8974 kind1 = PyUnicode_KIND(str);
8975 kind2 = PyUnicode_KIND(substr);
8976 if (kind1 < kind2)
8977 return 0;
8978
8979 len1 = PyUnicode_GET_LENGTH(str);
8980 len2 = PyUnicode_GET_LENGTH(substr);
8981 ADJUST_INDICES(start, end, len1);
8982 if (end - start < len2)
8983 return 0;
8984
8985 buf1 = PyUnicode_DATA(str);
8986 buf2 = PyUnicode_DATA(substr);
8987 if (kind2 != kind1) {
8988 buf2 = unicode_askind(kind2, buf2, len2, kind1);
8989 if (!buf2)
8990 goto onError;
8991 }
8992
8993 switch (kind1) {
8994 case PyUnicode_1BYTE_KIND:
8995 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
8996 result = asciilib_count(
8997 ((const Py_UCS1*)buf1) + start, end - start,
8998 buf2, len2, PY_SSIZE_T_MAX
8999 );
9000 else
9001 result = ucs1lib_count(
9002 ((const Py_UCS1*)buf1) + start, end - start,
9003 buf2, len2, PY_SSIZE_T_MAX
9004 );
9005 break;
9006 case PyUnicode_2BYTE_KIND:
9007 result = ucs2lib_count(
9008 ((const Py_UCS2*)buf1) + start, end - start,
9009 buf2, len2, PY_SSIZE_T_MAX
9010 );
9011 break;
9012 case PyUnicode_4BYTE_KIND:
9013 result = ucs4lib_count(
9014 ((const Py_UCS4*)buf1) + start, end - start,
9015 buf2, len2, PY_SSIZE_T_MAX
9016 );
9017 break;
9018 default:
9019 Py_UNREACHABLE();
9020 }
9021
9022 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9023 if (kind2 != kind1)
9024 PyMem_Free((void *)buf2);
9025
9026 return result;
9027 onError:
9028 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9029 if (kind2 != kind1)
9030 PyMem_Free((void *)buf2);
9031 return -1;
9032 }
9033
9034 Py_ssize_t
9035 PyUnicode_Find(PyObject *str,
9036 PyObject *substr,
9037 Py_ssize_t start,
9038 Py_ssize_t end,
9039 int direction)
9040 {
9041 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9042 return -2;
9043
9044 return any_find_slice(str, substr, start, end, direction);
9045 }
9046
9047 Py_ssize_t
9048 7654140 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9049 Py_ssize_t start, Py_ssize_t end,
9050 int direction)
9051 {
9052 int kind;
9053 Py_ssize_t len, result;
9054 7654140 len = PyUnicode_GET_LENGTH(str);
9055
3/10
✗ Branch 0 not taken.
✓ Branch 1 taken 7654140 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 7654140 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 7654140 times.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
7654140 ADJUST_INDICES(start, end, len);
9056
2/2
✓ Branch 0 taken 32 times.
✓ Branch 1 taken 7654108 times.
7654140 if (end - start < 1)
9057 32 return -1;
9058 7654108 kind = PyUnicode_KIND(str);
9059 7654108 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9060 kind, end-start, ch, direction);
9061
2/2
✓ Branch 0 taken 5861221 times.
✓ Branch 1 taken 1792887 times.
7654108 if (result == -1)
9062 5861221 return -1;
9063 else
9064 1792887 return start + result;
9065 }
9066
9067 static int
9068 16059940 tailmatch(PyObject *self,
9069 PyObject *substring,
9070 Py_ssize_t start,
9071 Py_ssize_t end,
9072 int direction)
9073 {
9074 int kind_self;
9075 int kind_sub;
9076 const void *data_self;
9077 const void *data_sub;
9078 Py_ssize_t offset;
9079 Py_ssize_t i;
9080 Py_ssize_t end_sub;
9081
9082
2/10
✓ Branch 1 taken 16059940 times.
✗ Branch 2 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
✓ Branch 10 taken 16059940 times.
✗ Branch 12 not taken.
✗ Branch 13 not taken.
16059940 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9083 16059940 end -= PyUnicode_GET_LENGTH(substring);
9084
2/2
✓ Branch 0 taken 627817 times.
✓ Branch 1 taken 15432123 times.
16059940 if (end < start)
9085 627817 return 0;
9086
9087
2/2
✓ Branch 1 taken 35777 times.
✓ Branch 2 taken 15396346 times.
15432123 if (PyUnicode_GET_LENGTH(substring) == 0)
9088 35777 return 1;
9089
9090 15396346 kind_self = PyUnicode_KIND(self);
9091 15396346 data_self = PyUnicode_DATA(self);
9092 15396346 kind_sub = PyUnicode_KIND(substring);
9093 15396346 data_sub = PyUnicode_DATA(substring);
9094 15396346 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9095
9096
2/2
✓ Branch 0 taken 4213697 times.
✓ Branch 1 taken 11182649 times.
15396346 if (direction > 0)
9097 4213697 offset = end;
9098 else
9099 11182649 offset = start;
9100
9101
2/2
✓ Branch 1 taken 5851701 times.
✓ Branch 2 taken 9544645 times.
30792692 if (PyUnicode_READ(kind_self, data_self, offset) ==
9102
2/2
✓ Branch 1 taken 4351157 times.
✓ Branch 2 taken 1500544 times.
21248047 PyUnicode_READ(kind_sub, data_sub, 0) &&
9103 5851701 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9104 5851701 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9105 /* If both are of the same kind, memcmp is sufficient */
9106
2/2
✓ Branch 0 taken 4351155 times.
✓ Branch 1 taken 2 times.
4351157 if (kind_self == kind_sub) {
9107 4351155 return ! memcmp((char *)data_self +
9108 4351155 (offset * PyUnicode_KIND(substring)),
9109 data_sub,
9110 4351155 PyUnicode_GET_LENGTH(substring) *
9111 4351155 PyUnicode_KIND(substring));
9112 }
9113 /* otherwise we have to compare each character by first accessing it */
9114 else {
9115 /* We do not need to compare 0 and len(substring)-1 because
9116 the if statement above ensured already that they are equal
9117 when we end up here. */
9118
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 for (i = 1; i < end_sub; ++i) {
9119 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9120 PyUnicode_READ(kind_sub, data_sub, i))
9121 return 0;
9122 }
9123 2 return 1;
9124 }
9125 }
9126
9127 11045189 return 0;
9128 }
9129
9130 Py_ssize_t
9131 2141 PyUnicode_Tailmatch(PyObject *str,
9132 PyObject *substr,
9133 Py_ssize_t start,
9134 Py_ssize_t end,
9135 int direction)
9136 {
9137
2/4
✓ Branch 1 taken 2141 times.
✗ Branch 2 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2141 times.
2141 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9138 return -1;
9139
9140 2141 return tailmatch(str, substr, start, end, direction);
9141 }
9142
9143 static PyObject *
9144 2608514 ascii_upper_or_lower(PyObject *self, int lower)
9145 {
9146 2608514 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9147 2608514 const char *data = PyUnicode_DATA(self);
9148 char *resdata;
9149 PyObject *res;
9150
9151 2608514 res = PyUnicode_New(len, 127);
9152
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2608514 times.
2608514 if (res == NULL)
9153 return NULL;
9154 2608514 resdata = PyUnicode_DATA(res);
9155
2/2
✓ Branch 0 taken 2479611 times.
✓ Branch 1 taken 128903 times.
2608514 if (lower)
9156 2479611 _Py_bytes_lower(resdata, data, len);
9157 else
9158 128903 _Py_bytes_upper(resdata, data, len);
9159 2608514 return res;
9160 }
9161
9162 static Py_UCS4
9163 handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9164 {
9165 Py_ssize_t j;
9166 int final_sigma;
9167 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
9168 /* U+03A3 is in the Final_Sigma context when, it is found like this:
9169
9170 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9171
9172 where ! is a negation and \p{xxx} is a character with property xxx.
9173 */
9174 for (j = i - 1; j >= 0; j--) {
9175 c = PyUnicode_READ(kind, data, j);
9176 if (!_PyUnicode_IsCaseIgnorable(c))
9177 break;
9178 }
9179 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9180 if (final_sigma) {
9181 for (j = i + 1; j < length; j++) {
9182 c = PyUnicode_READ(kind, data, j);
9183 if (!_PyUnicode_IsCaseIgnorable(c))
9184 break;
9185 }
9186 final_sigma = j == length || !_PyUnicode_IsCased(c);
9187 }
9188 return (final_sigma) ? 0x3C2 : 0x3C3;
9189 }
9190
9191 static int
9192 31597 lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9193 Py_UCS4 c, Py_UCS4 *mapped)
9194 {
9195 /* Obscure special case. */
9196
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 31597 times.
31597 if (c == 0x3A3) {
9197 mapped[0] = handle_capital_sigma(kind, data, length, i);
9198 return 1;
9199 }
9200 31597 return _PyUnicode_ToLowerFull(c, mapped);
9201 }
9202
9203 static Py_ssize_t
9204 757 do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9205 {
9206 757 Py_ssize_t i, k = 0;
9207 int n_res, j;
9208 Py_UCS4 c, mapped[3];
9209
9210 757 c = PyUnicode_READ(kind, data, 0);
9211 757 n_res = _PyUnicode_ToTitleFull(c, mapped);
9212
2/2
✓ Branch 0 taken 757 times.
✓ Branch 1 taken 757 times.
1514 for (j = 0; j < n_res; j++) {
9213 757 *maxchar = Py_MAX(*maxchar, mapped[j]);
9214 757 res[k++] = mapped[j];
9215 }
9216
2/2
✓ Branch 0 taken 4962 times.
✓ Branch 1 taken 757 times.
5719 for (i = 1; i < length; i++) {
9217 4962 c = PyUnicode_READ(kind, data, i);
9218 4962 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9219
2/2
✓ Branch 0 taken 4962 times.
✓ Branch 1 taken 4962 times.
9924 for (j = 0; j < n_res; j++) {
9220 4962 *maxchar = Py_MAX(*maxchar, mapped[j]);
9221 4962 res[k++] = mapped[j];
9222 }
9223 }
9224 757 return k;
9225 }
9226
9227 static Py_ssize_t
9228 do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9229 Py_ssize_t i, k = 0;
9230
9231 for (i = 0; i < length; i++) {
9232 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9233 int n_res, j;
9234 if (Py_UNICODE_ISUPPER(c)) {
9235 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9236 }
9237 else if (Py_UNICODE_ISLOWER(c)) {
9238 n_res = _PyUnicode_ToUpperFull(c, mapped);
9239 }
9240 else {
9241 n_res = 1;
9242 mapped[0] = c;
9243 }
9244 for (j = 0; j < n_res; j++) {
9245 *maxchar = Py_MAX(*maxchar, mapped[j]);
9246 res[k++] = mapped[j];
9247 }
9248 }
9249 return k;
9250 }
9251
9252 static Py_ssize_t
9253 466 do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9254 Py_UCS4 *maxchar, int lower)
9255 {
9256 466 Py_ssize_t i, k = 0;
9257
9258
2/2
✓ Branch 0 taken 466 times.
✓ Branch 1 taken 466 times.
932 for (i = 0; i < length; i++) {
9259 466 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9260 int n_res, j;
9261
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 466 times.
466 if (lower)
9262 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9263 else
9264 466 n_res = _PyUnicode_ToUpperFull(c, mapped);
9265
2/2
✓ Branch 0 taken 466 times.
✓ Branch 1 taken 466 times.
932 for (j = 0; j < n_res; j++) {
9266 466 *maxchar = Py_MAX(*maxchar, mapped[j]);
9267 466 res[k++] = mapped[j];
9268 }
9269 }
9270 466 return k;
9271 }
9272
9273 static Py_ssize_t
9274 466 do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9275 {
9276 466 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9277 }
9278
9279 static Py_ssize_t
9280 do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9281 {
9282 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9283 }
9284
9285 static Py_ssize_t
9286 do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9287 {
9288 Py_ssize_t i, k = 0;
9289
9290 for (i = 0; i < length; i++) {
9291 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9292 Py_UCS4 mapped[3];
9293 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9294 for (j = 0; j < n_res; j++) {
9295 *maxchar = Py_MAX(*maxchar, mapped[j]);
9296 res[k++] = mapped[j];
9297 }
9298 }
9299 return k;
9300 }
9301
9302 static Py_ssize_t
9303 5243 do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9304 {
9305 5243 Py_ssize_t i, k = 0;
9306 int previous_is_cased;
9307
9308 5243 previous_is_cased = 0;
9309
2/2
✓ Branch 0 taken 32165 times.
✓ Branch 1 taken 5243 times.
37408 for (i = 0; i < length; i++) {
9310 32165 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9311 Py_UCS4 mapped[3];
9312 int n_res, j;
9313
9314
2/2
✓ Branch 0 taken 26635 times.
✓ Branch 1 taken 5530 times.
32165 if (previous_is_cased)
9315 26635 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9316 else
9317 5530 n_res = _PyUnicode_ToTitleFull(c, mapped);
9318
9319
2/2
✓ Branch 0 taken 32165 times.
✓ Branch 1 taken 32165 times.
64330 for (j = 0; j < n_res; j++) {
9320 32165 *maxchar = Py_MAX(*maxchar, mapped[j]);
9321 32165 res[k++] = mapped[j];
9322 }
9323
9324 32165 previous_is_cased = _PyUnicode_IsCased(c);
9325 }
9326 5243 return k;
9327 }
9328
9329 static PyObject *
9330 6466 case_operation(PyObject *self,
9331 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9332 {
9333 6466 PyObject *res = NULL;
9334 6466 Py_ssize_t length, newlength = 0;
9335 int kind, outkind;
9336 const void *data;
9337 void *outdata;
9338 6466 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9339
9340 6466 kind = PyUnicode_KIND(self);
9341 6466 data = PyUnicode_DATA(self);
9342 6466 length = PyUnicode_GET_LENGTH(self);
9343
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6466 times.
6466 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9344 PyErr_SetString(PyExc_OverflowError, "string is too long");
9345 return NULL;
9346 }
9347 6466 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9348
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6466 times.
6466 if (tmp == NULL)
9349 return PyErr_NoMemory();
9350 6466 newlength = perform(kind, data, length, tmp, &maxchar);
9351 6466 res = PyUnicode_New(newlength, maxchar);
9352
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6466 times.
6466 if (res == NULL)
9353 goto leave;
9354 6466 tmpend = tmp + newlength;
9355 6466 outdata = PyUnicode_DATA(res);
9356 6466 outkind = PyUnicode_KIND(res);
9357
2/4
✓ Branch 0 taken 6000 times.
✓ Branch 1 taken 466 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
6466 switch (outkind) {
9358 6000 case PyUnicode_1BYTE_KIND:
9359
4/4
✓ Branch 0 taken 6780 times.
✓ Branch 1 taken 6000 times.
✓ Branch 2 taken 10764 times.
✓ Branch 3 taken 6000 times.
23544 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9360 6000 break;
9361 466 case PyUnicode_2BYTE_KIND:
9362
3/4
✗ Branch 0 not taken.
✓ Branch 1 taken 466 times.
✓ Branch 2 taken 466 times.
✓ Branch 3 taken 466 times.
932 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9363 466 break;
9364 case PyUnicode_4BYTE_KIND:
9365 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9366 break;
9367 default:
9368 Py_UNREACHABLE();
9369 }
9370 6466 leave:
9371 6466 PyMem_Free(tmp);
9372 6466 return res;
9373 }
9374
9375 PyObject *
9376 10828842 PyUnicode_Join(PyObject *separator, PyObject *seq)
9377 {
9378 PyObject *res;
9379 PyObject *fseq;
9380 Py_ssize_t seqlen;
9381 PyObject **items;
9382
9383 10828842 fseq = PySequence_Fast(seq, "can only join an iterable");
9384
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 10828842 times.
10828842 if (fseq == NULL) {
9385 return NULL;
9386 }
9387
9388 /* NOTE: the following code can't call back into Python code,
9389 * so we are sure that fseq won't be mutated.
9390 */
9391
9392
2/2
✓ Branch 2 taken 10485962 times.
✓ Branch 3 taken 342880 times.
10828842 items = PySequence_Fast_ITEMS(fseq);
9393
2/2
✓ Branch 2 taken 10485962 times.
✓ Branch 3 taken 342880 times.
10828842 seqlen = PySequence_Fast_GET_SIZE(fseq);
9394 10828842 res = _PyUnicode_JoinArray(separator, items, seqlen);
9395 10828842 Py_DECREF(fseq);
9396 10828842 return res;
9397 }
9398
9399 PyObject *
9400 16039163 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9401 {
9402 16039163 PyObject *res = NULL; /* the result */
9403 16039163 PyObject *sep = NULL;
9404 Py_ssize_t seplen;
9405 PyObject *item;
9406 Py_ssize_t sz, i, res_offset;
9407 Py_UCS4 maxchar;
9408 Py_UCS4 item_maxchar;
9409 int use_memcpy;
9410 16039163 unsigned char *res_data = NULL, *sep_data = NULL;
9411 PyObject *last_obj;
9412 16039163 int kind = 0;
9413
9414 /* If empty sequence, return u"". */
9415
2/2
✓ Branch 0 taken 375834 times.
✓ Branch 1 taken 15663329 times.
16039163 if (seqlen == 0) {
9416 375834 _Py_RETURN_UNICODE_EMPTY();
9417 }
9418
9419 /* If singleton sequence with an exact Unicode, return that. */
9420 15663329 last_obj = NULL;
9421
2/2
✓ Branch 0 taken 4057066 times.
✓ Branch 1 taken 11606263 times.
15663329 if (seqlen == 1) {
9422
2/2
✓ Branch 1 taken 4056653 times.
✓ Branch 2 taken 413 times.
4057066 if (PyUnicode_CheckExact(items[0])) {
9423 4056653 res = items[0];
9424 4056653 Py_INCREF(res);
9425 4056653 return res;
9426 }
9427 413 seplen = 0;
9428 413 maxchar = 0;
9429 }
9430 else {
9431 /* Set up sep and seplen */
9432
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 11606263 times.
11606263 if (separator == NULL) {
9433 /* fall back to a blank space separator */
9434 sep = PyUnicode_FromOrdinal(' ');
9435 if (!sep)
9436 goto onError;
9437 seplen = 1;
9438 maxchar = 32;
9439 }
9440 else {
9441
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 11606263 times.
11606263 if (!PyUnicode_Check(separator)) {
9442 PyErr_Format(PyExc_TypeError,
9443 "separator: expected str instance,"
9444 " %.80s found",
9445 Py_TYPE(separator)->tp_name);
9446 goto onError;
9447 }
9448 11606263 sep = separator;
9449 11606263 seplen = PyUnicode_GET_LENGTH(separator);
9450 11606263 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9451 /* inc refcount to keep this code path symmetric with the
9452 above case of a blank separator */
9453 11606263 Py_INCREF(sep);
9454 }
9455 11606263 last_obj = sep;
9456 }
9457
9458 /* There are at least two things to join, or else we have a subclass
9459 * of str in the sequence.
9460 * Do a pre-pass to figure out the total amount of space we'll
9461 * need (sz), and see whether all argument are strings.
9462 */
9463 11606676 sz = 0;
9464 #ifdef Py_DEBUG
9465 use_memcpy = 0;
9466 #else
9467 11606676 use_memcpy = 1;
9468 #endif
9469
2/2
✓ Branch 0 taken 81267429 times.
✓ Branch 1 taken 11606676 times.
92874105 for (i = 0; i < seqlen; i++) {
9470 size_t add_sz;
9471 81267429 item = items[i];
9472
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 81267429 times.
81267429 if (!PyUnicode_Check(item)) {
9473 PyErr_Format(PyExc_TypeError,
9474 "sequence item %zd: expected str instance,"
9475 " %.80s found",
9476 i, Py_TYPE(item)->tp_name);
9477 goto onError;
9478 }
9479 81267429 add_sz = PyUnicode_GET_LENGTH(item);
9480 81267429 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9481 81267429 maxchar = Py_MAX(maxchar, item_maxchar);
9482
2/2
✓ Branch 0 taken 69660753 times.
✓ Branch 1 taken 11606676 times.
81267429 if (i != 0) {
9483 69660753 add_sz += seplen;
9484 }
9485
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 81267429 times.
81267429 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
9486 PyErr_SetString(PyExc_OverflowError,
9487 "join() result is too long for a Python string");
9488 goto onError;
9489 }
9490 81267429 sz += add_sz;
9491
4/4
✓ Branch 0 taken 80149069 times.
✓ Branch 1 taken 1118360 times.
✓ Branch 2 taken 80148656 times.
✓ Branch 3 taken 413 times.
81267429 if (use_memcpy && last_obj != NULL) {
9492
2/2
✓ Branch 0 taken 3531 times.
✓ Branch 1 taken 80145125 times.
80148656 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9493 3531 use_memcpy = 0;
9494 }
9495 81267429 last_obj = item;
9496 }
9497
9498 11606676 res = PyUnicode_New(sz, maxchar);
9499
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 11606676 times.
11606676 if (res == NULL)
9500 goto onError;
9501
9502 /* Catenate everything. */
9503 #ifdef Py_DEBUG
9504 use_memcpy = 0;
9505 #else
9506
2/2
✓ Branch 0 taken 11603145 times.
✓ Branch 1 taken 3531 times.
11606676 if (use_memcpy) {
9507 11603145 res_data = PyUnicode_1BYTE_DATA(res);
9508 11603145 kind = PyUnicode_KIND(res);
9509
2/2
✓ Branch 0 taken 4624140 times.
✓ Branch 1 taken 6979005 times.
11603145 if (seplen != 0)
9510 4624140 sep_data = PyUnicode_1BYTE_DATA(sep);
9511 }
9512 #endif
9513
2/2
✓ Branch 0 taken 11603145 times.
✓ Branch 1 taken 3531 times.
11606676 if (use_memcpy) {
9514
2/2
✓ Branch 0 taken 79982942 times.
✓ Branch 1 taken 11603145 times.
91586087 for (i = 0; i < seqlen; ++i) {
9515 Py_ssize_t itemlen;
9516 79982942 item = items[i];
9517
9518 /* Copy item, and maybe the separator. */
9519
4/4
✓ Branch 0 taken 68379797 times.
✓ Branch 1 taken 11603145 times.
✓ Branch 2 taken 8482143 times.
✓ Branch 3 taken 59897654 times.
79982942 if (i && seplen != 0) {
9520 8482143 memcpy(res_data,
9521 sep_data,
9522 8482143 kind * seplen);
9523 8482143 res_data += kind * seplen;
9524 }
9525
9526 79982942 itemlen = PyUnicode_GET_LENGTH(item);
9527
2/2
✓ Branch 0 taken 78097504 times.
✓ Branch 1 taken 1885438 times.
79982942 if (itemlen != 0) {
9528 156195008 memcpy(res_data,
9529 78097504 PyUnicode_DATA(item),
9530 78097504 kind * itemlen);
9531 78097504 res_data += kind * itemlen;
9532 }
9533 }
9534 assert(res_data == PyUnicode_1BYTE_DATA(res)
9535 + kind * PyUnicode_GET_LENGTH(res));
9536 }
9537 else {
9538
2/2
✓ Branch 0 taken 1284487 times.
✓ Branch 1 taken 3531 times.
1288018 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9539 Py_ssize_t itemlen;
9540 1284487 item = items[i];
9541
9542 /* Copy item, and maybe the separator. */
9543
4/4
✓ Branch 0 taken 1280956 times.
✓ Branch 1 taken 3531 times.
✓ Branch 2 taken 17708 times.
✓ Branch 3 taken 1263248 times.
1284487 if (i && seplen != 0) {
9544 17708 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9545 17708 res_offset += seplen;
9546 }
9547
9548 1284487 itemlen = PyUnicode_GET_LENGTH(item);
9549
1/2
✓ Branch 0 taken 1284487 times.
✗ Branch 1 not taken.
1284487 if (itemlen != 0) {
9550 1284487 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9551 1284487 res_offset += itemlen;
9552 }
9553 }
9554 assert(res_offset == PyUnicode_GET_LENGTH(res));
9555 }
9556
9557 11606676 Py_XDECREF(sep);
9558 assert(_PyUnicode_CheckConsistency(res, 1));
9559 11606676 return res;
9560
9561 onError:
9562 Py_XDECREF(sep);
9563 Py_XDECREF(res);
9564 return NULL;
9565 }
9566
9567 void
9568 8882 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9569 Py_UCS4 fill_char)
9570 {
9571 8882 const int kind = PyUnicode_KIND(unicode);
9572 8882 void *data = PyUnicode_DATA(unicode);
9573 assert(unicode_modifiable(unicode));
9574 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9575 assert(start >= 0);
9576 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9577 8882 unicode_fill(kind, data, fill_char, start, length);
9578 8882 }
9579
9580 Py_ssize_t
9581 5154 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9582 Py_UCS4 fill_char)
9583 {
9584 Py_ssize_t maxlen;
9585
9586
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 5154 times.
5154 if (!PyUnicode_Check(unicode)) {
9587 PyErr_BadInternalCall();
9588 return -1;
9589 }
9590
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 5154 times.
5154 if (unicode_check_modifiable(unicode))
9591 return -1;
9592
9593
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5154 times.
5154 if (start < 0) {
9594 PyErr_SetString(PyExc_IndexError, "string index out of range");
9595 return -1;
9596 }
9597
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 5154 times.
5154 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9598 PyErr_SetString(PyExc_ValueError,
9599 "fill character is bigger than "
9600 "the string maximum character");
9601 return -1;
9602 }
9603
9604 5154 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9605 5154 length = Py_MIN(maxlen, length);
9606
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5154 times.
5154 if (length <= 0)
9607 return 0;
9608
9609 5154 _PyUnicode_FastFill(unicode, start, length, fill_char);
9610 5154 return length;
9611 }
9612
9613 static PyObject *
9614 1420 pad(PyObject *self,
9615 Py_ssize_t left,
9616 Py_ssize_t right,
9617 Py_UCS4 fill)
9618 {
9619 PyObject *u;
9620 Py_UCS4 maxchar;
9621 int kind;
9622 void *data;
9623
9624
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1420 times.
1420 if (left < 0)
9625 left = 0;
9626
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1420 times.
1420 if (right < 0)
9627 right = 0;
9628
9629
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 1420 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
1420 if (left == 0 && right == 0)
9630 return unicode_result_unchanged(self);
9631
9632
1/2
✓ Branch 0 taken 1420 times.
✗ Branch 1 not taken.
1420 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9633
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1420 times.
1420 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9634 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9635 return NULL;
9636 }
9637 1420 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9638 1420 maxchar = Py_MAX(maxchar, fill);
9639 1420 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9640
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1420 times.
1420 if (!u)
9641 return NULL;
9642
9643 1420 kind = PyUnicode_KIND(u);
9644 1420 data = PyUnicode_DATA(u);
9645
1/2
✓ Branch 0 taken 1420 times.
✗ Branch 1 not taken.
1420 if (left)
9646 1420 unicode_fill(kind, data, fill, 0, left);
9647
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1420 times.
1420 if (right)
9648 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9649 1420 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
9650 assert(_PyUnicode_CheckConsistency(u, 1));
9651 1420 return u;
9652 }
9653
9654 PyObject *
9655 24472 PyUnicode_Splitlines(PyObject *string, int keepends)
9656 {
9657 PyObject *list;
9658
9659
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 24472 times.
24472 if (ensure_unicode(string) < 0)
9660 return NULL;
9661
9662
3/4
✓ Branch 0 taken 20958 times.
✓ Branch 1 taken 3512 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
24472 switch (PyUnicode_KIND(string)) {
9663 20958 case PyUnicode_1BYTE_KIND:
9664
2/2
✓ Branch 1 taken 20954 times.
✓ Branch 2 taken 4 times.
20958 if (PyUnicode_IS_ASCII(string))
9665 41908 list = asciilib_splitlines(
9666 20954 string, PyUnicode_1BYTE_DATA(string),
9667 PyUnicode_GET_LENGTH(string), keepends);
9668 else
9669 8 list = ucs1lib_splitlines(
9670 4 string, PyUnicode_1BYTE_DATA(string),
9671 PyUnicode_GET_LENGTH(string), keepends);
9672 20958 break;
9673 3512 case PyUnicode_2BYTE_KIND:
9674 7024 list = ucs2lib_splitlines(
9675 3512 string, PyUnicode_2BYTE_DATA(string),
9676 PyUnicode_GET_LENGTH(string), keepends);
9677 3512 break;
9678 2 case PyUnicode_4BYTE_KIND:
9679 4 list = ucs4lib_splitlines(
9680 2 string, PyUnicode_4BYTE_DATA(string),
9681 PyUnicode_GET_LENGTH(string), keepends);
9682 2 break;
9683 default:
9684 Py_UNREACHABLE();
9685 }
9686 24472 return list;
9687 }
9688
9689 static PyObject *
9690 1321023 split(PyObject *self,
9691 PyObject *substring,
9692 Py_ssize_t maxcount)
9693 {
9694 int kind1, kind2;
9695 const void *buf1, *buf2;
9696 Py_ssize_t len1, len2;
9697 PyObject* out;
9698
9699
2/2
✓ Branch 0 taken 695019 times.
✓ Branch 1 taken 626004 times.
1321023 if (maxcount < 0)
9700 695019 maxcount = PY_SSIZE_T_MAX;
9701
9702
2/2
✓ Branch 0 taken 93325 times.
✓ Branch 1 taken 1227698 times.
1321023 if (substring == NULL)
9703
1/4
✓ Branch 0 taken 93325 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
93325 switch (PyUnicode_KIND(self)) {
9704 93325 case PyUnicode_1BYTE_KIND:
9705
1/2
✓ Branch 1 taken 93325 times.
✗ Branch 2 not taken.
93325 if (PyUnicode_IS_ASCII(self))
9706 186650 return asciilib_split_whitespace(
9707 93325 self, PyUnicode_1BYTE_DATA(self),
9708 PyUnicode_GET_LENGTH(self), maxcount
9709 );
9710 else
9711 return ucs1lib_split_whitespace(
9712 self, PyUnicode_1BYTE_DATA(self),
9713 PyUnicode_GET_LENGTH(self), maxcount
9714 );
9715 case PyUnicode_2BYTE_KIND:
9716 return ucs2lib_split_whitespace(
9717 self, PyUnicode_2BYTE_DATA(self),
9718 PyUnicode_GET_LENGTH(self), maxcount
9719 );
9720 case PyUnicode_4BYTE_KIND:
9721 return ucs4lib_split_whitespace(
9722 self, PyUnicode_4BYTE_DATA(self),
9723 PyUnicode_GET_LENGTH(self), maxcount
9724 );
9725 default:
9726 Py_UNREACHABLE();
9727 }
9728
9729 1227698 kind1 = PyUnicode_KIND(self);
9730 1227698 kind2 = PyUnicode_KIND(substring);
9731 1227698 len1 = PyUnicode_GET_LENGTH(self);
9732 1227698 len2 = PyUnicode_GET_LENGTH(substring);
9733
3/4
✓ Branch 0 taken 1227698 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 20930 times.
✓ Branch 3 taken 1206768 times.
1227698 if (kind1 < kind2 || len1 < len2) {
9734 20930 out = PyList_New(1);
9735
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 20930 times.
20930 if (out == NULL)
9736 return NULL;
9737 20930 Py_INCREF(self);
9738 20930 PyList_SET_ITEM(out, 0, self);
9739 20930 return out;
9740 }
9741 1206768 buf1 = PyUnicode_DATA(self);
9742 1206768 buf2 = PyUnicode_DATA(substring);
9743
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1206768 times.
1206768 if (kind2 != kind1) {
9744 buf2 = unicode_askind(kind2, buf2, len2, kind1);
9745 if (!buf2)
9746 return NULL;
9747 }
9748
9749
1/4
✓ Branch 0 taken 1206768 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
1206768 switch (kind1) {
9750 1206768 case PyUnicode_1BYTE_KIND:
9751
3/4
✓ Branch 1 taken 1206293 times.
✓ Branch 2 taken 475 times.
✓ Branch 4 taken 1206293 times.
✗ Branch 5 not taken.
1206768 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9752 1206293 out = asciilib_split(
9753 self, buf1, len1, buf2, len2, maxcount);
9754 else
9755 475 out = ucs1lib_split(
9756 self, buf1, len1, buf2, len2, maxcount);
9757 1206768 break;
9758 case PyUnicode_2BYTE_KIND:
9759 out = ucs2lib_split(
9760 self, buf1, len1, buf2, len2, maxcount);
9761 break;
9762 case PyUnicode_4BYTE_KIND:
9763 out = ucs4lib_split(
9764 self, buf1, len1, buf2, len2, maxcount);
9765 break;
9766 default:
9767 out = NULL;
9768 }
9769 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
9770
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1206768 times.
1206768 if (kind2 != kind1)
9771 PyMem_Free((void *)buf2);
9772 1206768 return out;
9773 }
9774
9775 static PyObject *
9776 9567 rsplit(PyObject *self,
9777 PyObject *substring,
9778 Py_ssize_t maxcount)
9779 {
9780 int kind1, kind2;
9781 const void *buf1, *buf2;
9782 Py_ssize_t len1, len2;
9783 PyObject* out;
9784
9785
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 9567 times.
9567 if (maxcount < 0)
9786 maxcount = PY_SSIZE_T_MAX;
9787
9788
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 9567 times.
9567 if (substring == NULL)
9789 switch (PyUnicode_KIND(self)) {
9790 case PyUnicode_1BYTE_KIND:
9791 if (PyUnicode_IS_ASCII(self))
9792 return asciilib_rsplit_whitespace(
9793 self, PyUnicode_1BYTE_DATA(self),
9794 PyUnicode_GET_LENGTH(self), maxcount
9795 );
9796 else
9797 return ucs1lib_rsplit_whitespace(
9798 self, PyUnicode_1BYTE_DATA(self),
9799 PyUnicode_GET_LENGTH(self), maxcount
9800 );
9801 case PyUnicode_2BYTE_KIND:
9802 return ucs2lib_rsplit_whitespace(
9803 self, PyUnicode_2BYTE_DATA(self),
9804 PyUnicode_GET_LENGTH(self), maxcount
9805 );
9806 case PyUnicode_4BYTE_KIND:
9807 return ucs4lib_rsplit_whitespace(
9808 self, PyUnicode_4BYTE_DATA(self),
9809 PyUnicode_GET_LENGTH(self), maxcount
9810 );
9811 default:
9812 Py_UNREACHABLE();
9813 }
9814
9815 9567 kind1 = PyUnicode_KIND(self);
9816 9567 kind2 = PyUnicode_KIND(substring);
9817 9567 len1 = PyUnicode_GET_LENGTH(self);
9818 9567 len2 = PyUnicode_GET_LENGTH(substring);
9819
3/4
✓ Branch 0 taken 9567 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 22 times.
✓ Branch 3 taken 9545 times.
9567 if (kind1 < kind2 || len1 < len2) {
9820 22 out = PyList_New(1);
9821
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 22 times.
22 if (out == NULL)
9822 return NULL;
9823 22 Py_INCREF(self);
9824 22 PyList_SET_ITEM(out, 0, self);
9825 22 return out;
9826 }
9827 9545 buf1 = PyUnicode_DATA(self);
9828 9545 buf2 = PyUnicode_DATA(substring);
9829
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 9545 times.
9545 if (kind2 != kind1) {
9830 buf2 = unicode_askind(kind2, buf2, len2, kind1);
9831 if (!buf2)
9832 return NULL;
9833 }
9834
9835
1/4
✓ Branch 0 taken 9545 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
9545 switch (kind1) {
9836 9545 case PyUnicode_1BYTE_KIND:
9837
2/4
✓ Branch 1 taken 9545 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 9545 times.
✗ Branch 5 not taken.
9545 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9838 9545 out = asciilib_rsplit(
9839 self, buf1, len1, buf2, len2, maxcount);
9840 else
9841 out = ucs1lib_rsplit(
9842 self, buf1, len1, buf2, len2, maxcount);
9843 9545 break;
9844 case PyUnicode_2BYTE_KIND:
9845 out = ucs2lib_rsplit(
9846 self, buf1, len1, buf2, len2, maxcount);
9847 break;
9848 case PyUnicode_4BYTE_KIND:
9849 out = ucs4lib_rsplit(
9850 self, buf1, len1, buf2, len2, maxcount);
9851 break;
9852 default:
9853 out = NULL;
9854 }
9855 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
9856
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 9545 times.
9545 if (kind2 != kind1)
9857 PyMem_Free((void *)buf2);
9858 9545 return out;
9859 }
9860
9861 static Py_ssize_t
9862 175376 anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
9863 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9864 {
9865
1/4
✓ Branch 0 taken 175376 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
175376 switch (kind) {
9866 175376 case PyUnicode_1BYTE_KIND:
9867
2/4
✓ Branch 1 taken 175376 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 175376 times.
✗ Branch 5 not taken.
175376 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9868 175376 return asciilib_find(buf1, len1, buf2, len2, offset);
9869 else
9870 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9871 case PyUnicode_2BYTE_KIND:
9872 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9873 case PyUnicode_4BYTE_KIND:
9874 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9875 }
9876 Py_UNREACHABLE();
9877 }
9878
9879 static Py_ssize_t
9880 14489573 anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
9881 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9882 {
9883
1/4
✓ Branch 0 taken 14489573 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
14489573 switch (kind) {
9884 14489573 case PyUnicode_1BYTE_KIND:
9885
3/4
✓ Branch 1 taken 14489489 times.
✓ Branch 2 taken 84 times.
✓ Branch 4 taken 14489489 times.
✗ Branch 5 not taken.
14489573 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9886 14489489 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9887 else
9888 84 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9889 case PyUnicode_2BYTE_KIND:
9890 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9891 case PyUnicode_4BYTE_KIND:
9892 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9893 }
9894 Py_UNREACHABLE();
9895 }
9896
9897 static void
9898 50579 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
9899 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
9900 {
9901 50579 int kind = PyUnicode_KIND(u);
9902 50579 void *data = PyUnicode_DATA(u);
9903 50579 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
9904
1/2
✓ Branch 0 taken 50579 times.
✗ Branch 1 not taken.
50579 if (kind == PyUnicode_1BYTE_KIND) {
9905 50579 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
9906 (Py_UCS1 *)data + len,
9907 u1, u2, maxcount);
9908 }
9909 else if (kind == PyUnicode_2BYTE_KIND) {
9910 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
9911 (Py_UCS2 *)data + len,
9912 u1, u2, maxcount);
9913 }
9914 else {
9915 assert(kind == PyUnicode_4BYTE_KIND);
9916 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
9917 (Py_UCS4 *)data + len,
9918 u1, u2, maxcount);
9919 }
9920 50579 }
9921
9922 static PyObject *
9923 14796915 replace(PyObject *self, PyObject *str1,
9924 PyObject *str2, Py_ssize_t maxcount)
9925 {
9926 PyObject *u;
9927 14796915 const char *sbuf = PyUnicode_DATA(self);
9928 14796915 const void *buf1 = PyUnicode_DATA(str1);
9929 14796915 const void *buf2 = PyUnicode_DATA(str2);
9930 14796915 int srelease = 0, release1 = 0, release2 = 0;
9931 14796915 int skind = PyUnicode_KIND(self);
9932 14796915 int kind1 = PyUnicode_KIND(str1);
9933 14796915 int kind2 = PyUnicode_KIND(str2);
9934 14796915 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9935 14796915 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9936 14796915 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
9937 int mayshrink;
9938 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
9939
9940
2/2
✓ Branch 0 taken 75759 times.
✓ Branch 1 taken 14721156 times.
14796915 if (slen < len1)
9941 75759 goto nothing;
9942
9943
1/2
✓ Branch 0 taken 14721156 times.
✗ Branch 1 not taken.
14721156 if (maxcount < 0)
9944 14721156 maxcount = PY_SSIZE_T_MAX;
9945 else if (maxcount == 0)
9946 goto nothing;
9947
9948
2/2
✓ Branch 0 taken 687 times.
✓ Branch 1 taken 14720469 times.
14721156 if (str1 == str2)
9949 687 goto nothing;
9950
9951 14720469 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9952 14720469 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
9953
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14720469 times.
14720469 if (maxchar < maxchar_str1)
9954 /* substring too wide to be present */
9955 goto nothing;
9956 14720469 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9957 /* Replacing str1 with str2 may cause a maxchar reduction in the
9958 result string. */
9959
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 14720469 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
14720469 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
9960 14720469 maxchar = Py_MAX(maxchar, maxchar_str2);
9961
9962
2/2
✓ Branch 0 taken 230896 times.
✓ Branch 1 taken 14489573 times.
14720469 if (len1 == len2) {
9963 /* same length */
9964
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 230896 times.
230896 if (len1 == 0)
9965 goto nothing;
9966
2/2
✓ Branch 0 taken 225497 times.
✓ Branch 1 taken 5399 times.
230896 if (len1 == 1) {
9967 /* replace characters */
9968 Py_UCS4 u1, u2;
9969 Py_ssize_t pos;
9970
9971 225497 u1 = PyUnicode_READ(kind1, buf1, 0);
9972 225497 pos = findchar(sbuf, skind, slen, u1, 1);
9973
2/2
✓ Branch 0 taken 174918 times.
✓ Branch 1 taken 50579 times.
225497 if (pos < 0)
9974 174918 goto nothing;
9975 50579 u2 = PyUnicode_READ(kind2, buf2, 0);
9976 50579 u = PyUnicode_New(slen, maxchar);
9977
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 50579 times.
50579 if (!u)
9978 goto error;
9979
9980 50579 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
9981 50579 replace_1char_inplace(u, pos, u1, u2, maxcount);
9982 }
9983 else {
9984 5399 int rkind = skind;
9985 char *res;
9986 Py_ssize_t i;
9987
9988
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5399 times.
5399 if (kind1 < rkind) {
9989 /* widen substring */
9990 buf1 = unicode_askind(kind1, buf1, len1, rkind);
9991 if (!buf1) goto error;
9992 release1 = 1;
9993 }
9994 5399 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
9995
2/2
✓ Branch 0 taken 2280 times.
✓ Branch 1 taken 3119 times.
5399 if (i < 0)
9996 2280 goto nothing;
9997
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3119 times.
3119 if (rkind > kind2) {
9998 /* widen replacement */
9999 buf2 = unicode_askind(kind2, buf2, len2, rkind);
10000 if (!buf2) goto error;
10001 release2 = 1;
10002 }
10003
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3119 times.
3119 else if (rkind < kind2) {
10004 /* widen self and buf1 */
10005 rkind = kind2;
10006 if (release1) {
10007 assert(buf1 != PyUnicode_DATA(str1));
10008 PyMem_Free((void *)buf1);
10009 buf1 = PyUnicode_DATA(str1);
10010 release1 = 0;
10011 }
10012 sbuf = unicode_askind(skind, sbuf, slen, rkind);
10013 if (!sbuf) goto error;
10014 srelease = 1;
10015 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10016 if (!buf1) goto error;
10017 release1 = 1;
10018 }
10019 3119 u = PyUnicode_New(slen, maxchar);
10020
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3119 times.
3119 if (!u)
10021 goto error;
10022 assert(PyUnicode_KIND(u) == rkind);
10023 3119 res = PyUnicode_DATA(u);
10024
10025 3119 memcpy(res, sbuf, rkind * slen);
10026 /* change everything in-place, starting with this one */
10027 3119 memcpy(res + rkind * i,
10028 buf2,
10029 3119 rkind * len2);
10030 3119 i += len1;
10031
10032
1/2
✓ Branch 0 taken 3212 times.
✗ Branch 1 not taken.
3212 while ( --maxcount > 0) {
10033 3212 i = anylib_find(rkind, self,
10034 3212 sbuf+rkind*i, slen-i,
10035 str1, buf1, len1, i);
10036
2/2
✓ Branch 0 taken 3119 times.
✓ Branch 1 taken 93 times.
3212 if (i == -1)
10037 3119 break;
10038 93 memcpy(res + rkind * i,
10039 buf2,
10040 93 rkind * len2);
10041 93 i += len1;
10042 }
10043 }
10044 }
10045 else {
10046 Py_ssize_t n, i, j, ires;
10047 Py_ssize_t new_size;
10048 14489573 int rkind = skind;
10049 char *res;
10050
10051
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14489573 times.
14489573 if (kind1 < rkind) {
10052 /* widen substring */
10053 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10054 if (!buf1) goto error;
10055 release1 = 1;
10056 }
10057 14489573 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10058
2/2
✓ Branch 0 taken 14390407 times.
✓ Branch 1 taken 99166 times.
14489573 if (n == 0)
10059 14390407 goto nothing;
10060
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 99166 times.
99166 if (kind2 < rkind) {
10061 /* widen replacement */
10062 buf2 = unicode_askind(kind2, buf2, len2, rkind);
10063 if (!buf2) goto error;
10064 release2 = 1;
10065 }
10066
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 99166 times.
99166 else if (kind2 > rkind) {
10067 /* widen self and buf1 */
10068 rkind = kind2;
10069 sbuf = unicode_askind(skind, sbuf, slen, rkind);
10070 if (!sbuf) goto error;
10071 srelease = 1;
10072 if (release1) {
10073 assert(buf1 != PyUnicode_DATA(str1));
10074 PyMem_Free((void *)buf1);
10075 buf1 = PyUnicode_DATA(str1);
10076 release1 = 0;
10077 }
10078 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10079 if (!buf1) goto error;
10080 release1 = 1;
10081 }
10082 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10083 PyUnicode_GET_LENGTH(str1)); */
10084
3/4
✓ Branch 0 taken 9633 times.
✓ Branch 1 taken 89533 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 9633 times.
99166 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10085 PyErr_SetString(PyExc_OverflowError,
10086 "replace string is too long");
10087 goto error;
10088 }
10089 99166 new_size = slen + n * (len2 - len1);
10090
2/2
✓ Branch 0 taken 67 times.
✓ Branch 1 taken 99099 times.
99166 if (new_size == 0) {
10091 67 u = unicode_new_empty();
10092 67 goto done;
10093 }
10094
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 99099 times.
99099 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10095 PyErr_SetString(PyExc_OverflowError,
10096 "replace string is too long");
10097 goto error;
10098 }
10099 99099 u = PyUnicode_New(new_size, maxchar);
10100
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 99099 times.
99099 if (!u)
10101 goto error;
10102 assert(PyUnicode_KIND(u) == rkind);
10103 99099 res = PyUnicode_DATA(u);
10104 99099 ires = i = 0;
10105
1/2
✓ Branch 0 taken 99099 times.
✗ Branch 1 not taken.
99099 if (len1 > 0) {
10106
2/2
✓ Branch 0 taken 166765 times.
✓ Branch 1 taken 99099 times.
265864 while (n-- > 0) {
10107 /* look for next match */
10108 166765 j = anylib_find(rkind, self,
10109 166765 sbuf + rkind * i, slen-i,
10110 str1, buf1, len1, i);
10111
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 166765 times.
166765 if (j == -1)
10112 break;
10113
2/2
✓ Branch 0 taken 152885 times.
✓ Branch 1 taken 13880 times.
166765 else if (j > i) {
10114 /* copy unchanged part [i:j] */
10115 152885 memcpy(res + rkind * ires,
10116 152885 sbuf + rkind * i,
10117 152885 rkind * (j-i));
10118 152885 ires += j - i;
10119 }
10120 /* copy substitution string */
10121
2/2
✓ Branch 0 taken 69947 times.
✓ Branch 1 taken 96818 times.
166765 if (len2 > 0) {
10122 69947 memcpy(res + rkind * ires,
10123 buf2,
10124 69947 rkind * len2);
10125 69947 ires += len2;
10126 }
10127 166765 i = j + len1;
10128 }
10129
2/2
✓ Branch 0 taken 95406 times.
✓ Branch 1 taken 3693 times.
99099 if (i < slen)
10130 /* copy tail [i:] */
10131 95406 memcpy(res + rkind * ires,
10132 95406 sbuf + rkind * i,
10133 95406 rkind * (slen-i));
10134 }
10135 else {
10136 /* interleave */
10137 while (n > 0) {
10138 memcpy(res + rkind * ires,
10139 buf2,
10140 rkind * len2);
10141 ires += len2;
10142 if (--n <= 0)
10143 break;
10144 memcpy(res + rkind * ires,
10145 sbuf + rkind * i,
10146 rkind);
10147 ires++;
10148 i++;
10149 }
10150 memcpy(res + rkind * ires,
10151 sbuf + rkind * i,
10152 rkind * (slen-i));
10153 }
10154 }
10155
10156
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 152797 times.
152797 if (mayshrink) {
10157 unicode_adjust_maxchar(&u);
10158 if (u == NULL)
10159 goto error;
10160 }
10161
10162 152797 done:
10163 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10164 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10165 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10166
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 152864 times.
152864 if (srelease)
10167 PyMem_Free((void *)sbuf);
10168
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 152864 times.
152864 if (release1)
10169 PyMem_Free((void *)buf1);
10170
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 152864 times.
152864 if (release2)
10171 PyMem_Free((void *)buf2);
10172 assert(_PyUnicode_CheckConsistency(u, 1));
10173 152864 return u;
10174
10175 14644051 nothing:
10176 /* nothing to replace; return original string (when possible) */
10177 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10178 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10179 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10180
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14644051 times.
14644051 if (srelease)
10181 PyMem_Free((void *)sbuf);
10182
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14644051 times.
14644051 if (release1)
10183 PyMem_Free((void *)buf1);
10184
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14644051 times.
14644051 if (release2)
10185 PyMem_Free((void *)buf2);
10186 14644051 return unicode_result_unchanged(self);
10187
10188 error:
10189 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10190 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10191 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10192 if (srelease)
10193 PyMem_Free((void *)sbuf);
10194 if (release1)
10195 PyMem_Free((void *)buf1);
10196 if (release2)
10197 PyMem_Free((void *)buf2);
10198 return NULL;
10199 }
10200
10201 /* --- Unicode Object Methods --------------------------------------------- */
10202
10203 /*[clinic input]
10204 str.title as unicode_title
10205
10206 Return a version of the string where each word is titlecased.
10207
10208 More specifically, words start with uppercased characters and all remaining
10209 cased characters have lower case.
10210 [clinic start generated code]*/
10211
10212 static PyObject *
10213 5243 unicode_title_impl(PyObject *self)
10214 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10215 {
10216 5243 return case_operation(self, do_title);
10217 }
10218
10219 /*[clinic input]
10220 str.capitalize as unicode_capitalize
10221
10222 Return a capitalized version of the string.
10223
10224 More specifically, make the first character have upper case and the rest lower
10225 case.
10226 [clinic start generated code]*/
10227
10228 static PyObject *
10229 757 unicode_capitalize_impl(PyObject *self)
10230 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10231 {
10232
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 757 times.
757 if (PyUnicode_GET_LENGTH(self) == 0)
10233 return unicode_result_unchanged(self);
10234 757 return case_operation(self, do_capitalize);
10235 }
10236
10237 /*[clinic input]
10238 str.casefold as unicode_casefold
10239
10240 Return a version of the string suitable for caseless comparisons.
10241 [clinic start generated code]*/
10242
10243 static PyObject *
10244 6720 unicode_casefold_impl(PyObject *self)
10245 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10246 {
10247
1/2
✓ Branch 1 taken 6720 times.
✗ Branch 2 not taken.
6720 if (PyUnicode_IS_ASCII(self))
10248 6720 return ascii_upper_or_lower(self, 1);
10249 return case_operation(self, do_casefold);
10250 }
10251
10252
10253 /* Argument converter. Accepts a single Unicode character. */
10254
10255 static int
10256 convert_uc(PyObject *obj, void *addr)
10257 {
10258 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10259
10260 if (!PyUnicode_Check(obj)) {
10261 PyErr_Format(PyExc_TypeError,
10262 "The fill character must be a unicode character, "
10263 "not %.100s", Py_TYPE(obj)->tp_name);
10264 return 0;
10265 }
10266 if (PyUnicode_GET_LENGTH(obj) != 1) {
10267 PyErr_SetString(PyExc_TypeError,
10268 "The fill character must be exactly one character long");
10269 return 0;
10270 }
10271 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10272 return 1;
10273 }
10274
10275 /*[clinic input]
10276 str.center as unicode_center
10277
10278 width: Py_ssize_t
10279 fillchar: Py_UCS4 = ' '
10280 /
10281
10282 Return a centered string of length width.
10283
10284 Padding is done using the specified fill character (default is a space).
10285 [clinic start generated code]*/
10286
10287 static PyObject *
10288 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10289 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10290 {
10291 Py_ssize_t marg, left;
10292
10293 if (PyUnicode_GET_LENGTH(self) >= width)
10294 return unicode_result_unchanged(self);
10295
10296 marg = width - PyUnicode_GET_LENGTH(self);
10297 left = marg / 2 + (marg & width & 1);
10298
10299 return pad(self, left, marg - left, fillchar);
10300 }
10301
10302 /* This function assumes that str1 and str2 are readied by the caller. */
10303
10304 static int
10305 5965739 unicode_compare(PyObject *str1, PyObject *str2)
10306 {
10307 #define COMPARE(TYPE1, TYPE2) \
10308 do { \
10309 TYPE1* p1 = (TYPE1 *)data1; \
10310 TYPE2* p2 = (TYPE2 *)data2; \
10311 TYPE1* end = p1 + len; \
10312 Py_UCS4 c1, c2; \
10313 for (; p1 != end; p1++, p2++) { \
10314 c1 = *p1; \
10315 c2 = *p2; \
10316 if (c1 != c2) \
10317 return (c1 < c2) ? -1 : 1; \
10318 } \
10319 } \
10320 while (0)
10321
10322 int kind1, kind2;
10323 const void *data1, *data2;
10324 Py_ssize_t len1, len2, len;
10325
10326 5965739 kind1 = PyUnicode_KIND(str1);
10327 5965739 kind2 = PyUnicode_KIND(str2);
10328 5965739 data1 = PyUnicode_DATA(str1);
10329 5965739 data2 = PyUnicode_DATA(str2);
10330 5965739 len1 = PyUnicode_GET_LENGTH(str1);
10331 5965739 len2 = PyUnicode_GET_LENGTH(str2);
10332 5965739 len = Py_MIN(len1, len2);
10333
10334
3/4
✓ Branch 0 taken 3240354 times.
✓ Branch 1 taken 2725382 times.
✓ Branch 2 taken 3 times.
✗ Branch 3 not taken.
5965739 switch(kind1) {
10335
2/4
✓ Branch 0 taken 3123931 times.
✓ Branch 1 taken 116423 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
3240354 case PyUnicode_1BYTE_KIND:
10336 {
10337 switch(kind2) {
10338 3123931 case PyUnicode_1BYTE_KIND:
10339 {
10340 3123931 int cmp = memcmp(data1, data2, len);
10341 /* normalize result of memcmp() into the range [-1; 1] */
10342
2/2
✓ Branch 0 taken 583506 times.
✓ Branch 1 taken 2540425 times.
3123931 if (cmp < 0)
10343 583506 return -1;
10344
2/2
✓ Branch 0 taken 2143848 times.
✓ Branch 1 taken 396577 times.
2540425 if (cmp > 0)
10345 2143848 return 1;
10346 396577 break;
10347 }
10348 116423 case PyUnicode_2BYTE_KIND:
10349
3/6
✓ Branch 0 taken 116423 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 116423 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 116423 times.
✗ Branch 5 not taken.
116423 COMPARE(Py_UCS1, Py_UCS2);
10350 break;
10351 case PyUnicode_4BYTE_KIND:
10352 COMPARE(Py_UCS1, Py_UCS4);
10353 break;
10354 default:
10355 Py_UNREACHABLE();
10356 }
10357 396577 break;
10358 }
10359
2/4
✓ Branch 0 taken 37220 times.
✓ Branch 1 taken 2688162 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
2725382 case PyUnicode_2BYTE_KIND:
10360 {
10361 switch(kind2) {
10362 37220 case PyUnicode_1BYTE_KIND:
10363
3/6
✓ Branch 0 taken 37220 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 37220 times.
✓ Branch 4 taken 37220 times.
✗ Branch 5 not taken.
37220 COMPARE(Py_UCS2, Py_UCS1);
10364 break;
10365 2688162 case PyUnicode_2BYTE_KIND:
10366 {
10367
4/6
✓ Branch 0 taken 2688162 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1359406 times.
✓ Branch 3 taken 1328756 times.
✓ Branch 4 taken 2688162 times.
✗ Branch 5 not taken.
2688162 COMPARE(Py_UCS2, Py_UCS2);
10368 break;
10369 }
10370 case PyUnicode_4BYTE_KIND:
10371 COMPARE(Py_UCS2, Py_UCS4);
10372 break;
10373 default:
10374 Py_UNREACHABLE();
10375 }
10376 break;
10377 }
10378
2/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
3 case PyUnicode_4BYTE_KIND:
10379 {
10380 switch(kind2) {
10381 2 case PyUnicode_1BYTE_KIND:
10382
3/6
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
2 COMPARE(Py_UCS4, Py_UCS1);
10383 break;
10384 1 case PyUnicode_2BYTE_KIND:
10385
3/6
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
✓ Branch 4 taken 1 times.
✗ Branch 5 not taken.
1 COMPARE(Py_UCS4, Py_UCS2);
10386 break;
10387 case PyUnicode_4BYTE_KIND:
10388 {
10389 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10390 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10391 /* normalize result of wmemcmp() into the range [-1; 1] */
10392 if (cmp < 0)
10393 return -1;
10394 if (cmp > 0)
10395 return 1;
10396 #else
10397 COMPARE(Py_UCS4, Py_UCS4);
10398 #endif
10399 break;
10400 }
10401 default:
10402 Py_UNREACHABLE();
10403 }
10404 break;
10405 }
10406 default:
10407 Py_UNREACHABLE();
10408 }
10409
10410
2/2
✓ Branch 0 taken 347990 times.
✓ Branch 1 taken 48587 times.
396577 if (len1 == len2)
10411 347990 return 0;
10412
2/2
✓ Branch 0 taken 4299 times.
✓ Branch 1 taken 44288 times.
48587 if (len1 < len2)
10413 4299 return -1;
10414 else
10415 44288 return 1;
10416
10417 #undef COMPARE
10418 }
10419
10420 static int
10421 588451640 unicode_compare_eq(PyObject *str1, PyObject *str2)
10422 {
10423 int kind;
10424 const void *data1, *data2;
10425 Py_ssize_t len;
10426 int cmp;
10427
10428 588451640 len = PyUnicode_GET_LENGTH(str1);
10429
2/2
✓ Branch 1 taken 403540491 times.
✓ Branch 2 taken 184911149 times.
588451640 if (PyUnicode_GET_LENGTH(str2) != len)
10430 403540491 return 0;
10431 184911149 kind = PyUnicode_KIND(str1);
10432
2/2
✓ Branch 0 taken 430082 times.
✓ Branch 1 taken 184481067 times.
184911149 if (PyUnicode_KIND(str2) != kind)
10433 430082 return 0;
10434 184481067 data1 = PyUnicode_DATA(str1);
10435 184481067 data2 = PyUnicode_DATA(str2);
10436
10437 184481067 cmp = memcmp(data1, data2, len * kind);
10438 184481067 return (cmp == 0);
10439 }
10440
10441 int
10442 115344979 _PyUnicode_Equal(PyObject *str1, PyObject *str2)
10443 {
10444 assert(PyUnicode_CheckExact(str1));
10445 assert(PyUnicode_CheckExact(str2));
10446
2/2
✓ Branch 0 taken 43654324 times.
✓ Branch 1 taken 71690655 times.
115344979 if (str1 == str2) {
10447 43654324 return 1;
10448 }
10449 71690655 return unicode_compare_eq(str1, str2);
10450 }
10451
10452
10453 int
10454 638346 PyUnicode_Compare(PyObject *left, PyObject *right)
10455 {
10456
2/4
✓ Branch 2 taken 638346 times.
✗ Branch 3 not taken.
✓ Branch 6 taken 638346 times.
✗ Branch 7 not taken.
638346 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10457 /* a string is equal to itself */
10458
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 638346 times.
638346 if (left == right)
10459 return 0;
10460
10461 638346 return unicode_compare(left, right);
10462 }
10463 PyErr_Format(PyExc_TypeError,
10464 "Can't compare %.100s and %.100s",
10465 Py_TYPE(left)->tp_name,
10466 Py_TYPE(right)->tp_name);
10467 return -1;
10468 }
10469
10470 int
10471 1377143 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10472 {
10473 Py_ssize_t i;
10474 int kind;
10475 Py_UCS4 chr;
10476
10477 assert(_PyUnicode_CHECK(uni));
10478 1377143 kind = PyUnicode_KIND(uni);
10479
1/2
✓ Branch 0 taken 1377143 times.
✗ Branch 1 not taken.
1377143 if (kind == PyUnicode_1BYTE_KIND) {
10480 1377143 const void *data = PyUnicode_1BYTE_DATA(uni);
10481 1377143 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
10482 1377143 size_t len, len2 = strlen(str);
10483 int cmp;
10484
10485 1377143 len = Py_MIN(len1, len2);
10486 1377143 cmp = memcmp(data, str, len);
10487
2/2
✓ Branch 0 taken 1189436 times.
✓ Branch 1 taken 187707 times.
1377143 if (cmp != 0) {
10488
2/2
✓ Branch 0 taken 425759 times.
✓ Branch 1 taken 763677 times.
1189436 if (cmp < 0)
10489 425759 return -1;
10490 else
10491 763677 return 1;
10492 }
10493
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 187707 times.
187707 if (len1 > len2)
10494 return 1; /* uni is longer */
10495
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 187707 times.
187707 if (len1 < len2)
10496 return -1; /* str is longer */
10497 187707 return 0;
10498 }
10499 else {
10500 const void *data = PyUnicode_DATA(uni);
10501 /* Compare Unicode string and source character set string */
10502 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10503 if (chr != (unsigned char)str[i])
10504 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10505 /* This check keeps Python strings that end in '\0' from comparing equal
10506 to C strings identical up to that point. */
10507 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10508 return 1; /* uni is longer */
10509 if (str[i])
10510 return -1; /* str is longer */
10511 return 0;
10512 }
10513 }
10514
10515 int
10516 21314219 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
10517 {
10518 size_t len;
10519 assert(_PyUnicode_CHECK(unicode));
10520 assert(str);
10521 #ifndef NDEBUG
10522 for (const char *p = str; *p; p++) {
10523 assert((unsigned char)*p < 128);
10524 }
10525 #endif
10526
2/2
✓ Branch 1 taken 60 times.
✓ Branch 2 taken 21314159 times.
21314219 if (!PyUnicode_IS_ASCII(unicode))
10527 60 return 0;
10528 21314159 len = (size_t)PyUnicode_GET_LENGTH(unicode);
10529
2/2
✓ Branch 0 taken 1726948 times.
✓ Branch 1 taken 19587211 times.
23041107 return strlen(str) == len &&
10530
2/2
✓ Branch 1 taken 368624 times.
✓ Branch 2 taken 1358324 times.
1726948 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
10531 }
10532
10533 int
10534 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
10535 {
10536 PyObject *right_uni;
10537
10538 assert(_PyUnicode_CHECK(left));
10539 assert(right->string);
10540 #ifndef NDEBUG
10541 for (const char *p = right->string; *p; p++) {
10542 assert((unsigned char)*p < 128);
10543 }
10544 #endif
10545
10546 if (!PyUnicode_IS_ASCII(left))
10547 return 0;
10548
10549 right_uni = _PyUnicode_FromId(right); /* borrowed */
10550 if (right_uni == NULL) {
10551 /* memory error or bad data */
10552 PyErr_Clear();
10553 return _PyUnicode_EqualToASCIIString(left, right->string);
10554 }
10555
10556 if (left == right_uni)
10557 return 1;
10558
10559 if (PyUnicode_CHECK_INTERNED(left))
10560 return 0;
10561
10562 assert(_PyUnicode_HASH(right_uni) != -1);
10563 Py_hash_t hash = _PyUnicode_HASH(left);
10564 if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
10565 return 0;
10566 }
10567
10568 return unicode_compare_eq(left, right_uni);
10569 }
10570
10571 PyObject *
10572 611415797 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10573 {
10574 int result;
10575
10576
3/4
✓ Branch 2 taken 611415797 times.
✗ Branch 3 not taken.
✓ Branch 6 taken 3831772 times.
✓ Branch 7 taken 607584025 times.
611415797 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10577 3831772 Py_RETURN_NOTIMPLEMENTED;
10578
10579
2/2
✓ Branch 0 taken 85495647 times.
✓ Branch 1 taken 522088378 times.
607584025 if (left == right) {
10580
2/3
✓ Branch 0 taken 64996689 times.
✓ Branch 1 taken 20498958 times.
✗ Branch 2 not taken.
85495647 switch (op) {
10581 64996689 case Py_EQ:
10582 case Py_LE:
10583 case Py_GE:
10584 /* a string is equal to itself */
10585 64996689 Py_RETURN_TRUE;
10586 20498958 case Py_NE:
10587 case Py_LT:
10588 case Py_GT:
10589 20498958 Py_RETURN_FALSE;
10590 default:
10591 PyErr_BadArgument();
10592 return NULL;
10593 }
10594 }
10595
4/4
✓ Branch 0 taken 6203289 times.
✓ Branch 1 taken 515885089 times.
✓ Branch 2 taken 875896 times.
✓ Branch 3 taken 5327393 times.
522088378 else if (op == Py_EQ || op == Py_NE) {
10596 516760985 result = unicode_compare_eq(left, right);
10597 516760985 result ^= (op == Py_NE);
10598 516760985 return PyBool_FromLong(result);
10599 }
10600 else {
10601 5327393 result = unicode_compare(left, right);
10602
9/19
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✓ Branch 2 taken 4305259 times.
✓ Branch 3 taken 1016958 times.
✓ Branch 4 taken 5176 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 11 not taken.
✗ Branch 12 not taken.
✓ Branch 15 taken 1773685 times.
✓ Branch 16 taken 2531574 times.
✓ Branch 19 taken 859624 times.
✓ Branch 20 taken 157334 times.
✓ Branch 23 taken 5169 times.
✓ Branch 24 taken 7 times.
✗ Branch 27 not taken.
✗ Branch 28 not taken.
5327393 Py_RETURN_RICHCOMPARE(result, 0, op);
10603 }
10604 }
10605
10606 int
10607 5630095 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
10608 {
10609 5630095 return unicode_eq(aa, bb);
10610 }
10611
10612 int
10613 152484002 PyUnicode_Contains(PyObject *str, PyObject *substr)
10614 {
10615 int kind1, kind2;
10616 const void *buf1, *buf2;
10617 Py_ssize_t len1, len2;
10618 int result;
10619
10620
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 152484002 times.
152484002 if (!PyUnicode_Check(substr)) {
10621 PyErr_Format(PyExc_TypeError,
10622 "'in <string>' requires string as left operand, not %.100s",
10623 Py_TYPE(substr)->tp_name);
10624 return -1;
10625 }
10626
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 152484002 times.
152484002 if (ensure_unicode(str) < 0)
10627 return -1;
10628
10629 152484002 kind1 = PyUnicode_KIND(str);
10630 152484002 kind2 = PyUnicode_KIND(substr);
10631
2/2
✓ Branch 0 taken 59184 times.
✓ Branch 1 taken 152424818 times.
152484002 if (kind1 < kind2)
10632 59184 return 0;
10633 152424818 len1 = PyUnicode_GET_LENGTH(str);
10634 152424818 len2 = PyUnicode_GET_LENGTH(substr);
10635
2/2
✓ Branch 0 taken 81659649 times.
✓ Branch 1 taken 70765169 times.
152424818 if (len1 < len2)
10636 81659649 return 0;
10637 70765169 buf1 = PyUnicode_DATA(str);
10638 70765169 buf2 = PyUnicode_DATA(substr);
10639
2/2
✓ Branch 0 taken 69767737 times.
✓ Branch 1 taken 997432 times.
70765169 if (len2 == 1) {
10640 69767737 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
10641 69767737 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
10642 69767737 return result;
10643 }
10644
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 997432 times.
997432 if (kind2 != kind1) {
10645 buf2 = unicode_askind(kind2, buf2, len2, kind1);
10646 if (!buf2)
10647 return -1;
10648 }
10649
10650
1/4
✓ Branch 0 taken 997432 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
997432 switch (kind1) {
10651 997432 case PyUnicode_1BYTE_KIND:
10652 997432 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10653 997432 break;
10654 case PyUnicode_2BYTE_KIND:
10655 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10656 break;
10657 case PyUnicode_4BYTE_KIND:
10658 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10659 break;
10660 default:
10661 Py_UNREACHABLE();
10662 }
10663
10664 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
10665
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 997432 times.
997432 if (kind2 != kind1)
10666 PyMem_Free((void *)buf2);
10667
10668 997432 return result;
10669 }
10670
10671 /* Concat to string or Unicode object giving a new Unicode object. */
10672
10673 PyObject *
10674 48331141 PyUnicode_Concat(PyObject *left, PyObject *right)
10675 {
10676 PyObject *result;
10677 Py_UCS4 maxchar, maxchar2;
10678 Py_ssize_t left_len, right_len, new_len;
10679
10680
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 48331141 times.
48331141 if (ensure_unicode(left) < 0)
10681 return NULL;
10682
10683
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 48331141 times.
48331141 if (!PyUnicode_Check(right)) {
10684 PyErr_Format(PyExc_TypeError,
10685 "can only concatenate str (not \"%.200s\") to str",
10686 Py_TYPE(right)->tp_name);
10687 return NULL;
10688 }
10689
10690 /* Shortcuts */
10691 48331141 PyObject *empty = unicode_get_empty(); // Borrowed reference
10692
2/2
✓ Branch 0 taken 1483811 times.
✓ Branch 1 taken 46847330 times.
48331141 if (left == empty) {
10693 1483811 return PyUnicode_FromObject(right);
10694 }
10695
2/2
✓ Branch 0 taken 417729 times.
✓ Branch 1 taken 46429601 times.
46847330 if (right == empty) {
10696 417729 return PyUnicode_FromObject(left);
10697 }
10698
10699 46429601 left_len = PyUnicode_GET_LENGTH(left);
10700 46429601 right_len = PyUnicode_GET_LENGTH(right);
10701
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 46429601 times.
46429601 if (left_len > PY_SSIZE_T_MAX - right_len) {
10702 PyErr_SetString(PyExc_OverflowError,
10703 "strings are too large to concat");
10704 return NULL;
10705 }
10706 46429601 new_len = left_len + right_len;
10707
10708 46429601 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10709 46429601 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10710 46429601 maxchar = Py_MAX(maxchar, maxchar2);
10711
10712 /* Concat the two Unicode strings */
10713 46429601 result = PyUnicode_New(new_len, maxchar);
10714
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 46429601 times.
46429601 if (result == NULL)
10715 return NULL;
10716 46429601 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
10717 46429601 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
10718 assert(_PyUnicode_CheckConsistency(result, 1));
10719 46429601 return result;
10720 }
10721
10722 void
10723 2979009 PyUnicode_Append(PyObject **p_left, PyObject *right)
10724 {
10725 PyObject *left, *res;
10726 Py_UCS4 maxchar, maxchar2;
10727 Py_ssize_t left_len, right_len, new_len;
10728
10729
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2979009 times.
2979009 if (p_left == NULL) {
10730 if (!PyErr_Occurred())
10731 PyErr_BadInternalCall();
10732 return;
10733 }
10734 2979009 left = *p_left;
10735
2/4
✓ Branch 0 taken 2979009 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2979009 times.
✗ Branch 3 not taken.
2979009 if (right == NULL || left == NULL
10736
2/4
✓ Branch 2 taken 2979009 times.
✗ Branch 3 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2979009 times.
2979009 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
10737 if (!PyErr_Occurred())
10738 PyErr_BadInternalCall();
10739 goto error;
10740 }
10741
10742 /* Shortcuts */
10743 2979009 PyObject *empty = unicode_get_empty(); // Borrowed reference
10744
2/2
✓ Branch 0 taken 569260 times.
✓ Branch 1 taken 2409749 times.
2979009 if (left == empty) {
10745 569260 Py_DECREF(left);
10746 569260 Py_INCREF(right);
10747 569260 *p_left = right;
10748 569260 return;
10749 }
10750
2/2
✓ Branch 0 taken 4369 times.
✓ Branch 1 taken 2405380 times.
2409749 if (right == empty) {
10751 4369 return;
10752 }
10753
10754 2405380 left_len = PyUnicode_GET_LENGTH(left);
10755 2405380 right_len = PyUnicode_GET_LENGTH(right);
10756
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2405380 times.
2405380 if (left_len > PY_SSIZE_T_MAX - right_len) {
10757 PyErr_SetString(PyExc_OverflowError,
10758 "strings are too large to concat");
10759 goto error;
10760 }
10761 2405380 new_len = left_len + right_len;
10762
10763
2/2
✓ Branch 1 taken 541679 times.
✓ Branch 2 taken 1863701 times.
2405380 if (unicode_modifiable(left)
10764
1/2
✓ Branch 1 taken 541679 times.
✗ Branch 2 not taken.
541679 && PyUnicode_CheckExact(right)
10765
2/2
✓ Branch 0 taken 541659 times.
✓ Branch 1 taken 20 times.
541679 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
10766 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10767 to change the structure size, but characters are stored just after
10768 the structure, and so it requires to move all characters which is
10769 not so different than duplicating the string. */
10770
4/4
✓ Branch 1 taken 541134 times.
✓ Branch 2 taken 525 times.
✓ Branch 4 taken 541130 times.
✓ Branch 5 taken 4 times.
541659 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10771 {
10772 /* append inplace */
10773
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 541655 times.
541655 if (unicode_resize(p_left, new_len) != 0)
10774 goto error;
10775
10776 /* copy 'right' into the newly allocated area of 'left' */
10777 541655 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
10778 }
10779 else {
10780 1863725 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10781 1863725 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10782 1863725 maxchar = Py_MAX(maxchar, maxchar2);
10783
10784 /* Concat the two Unicode strings */
10785 1863725 res = PyUnicode_New(new_len, maxchar);
10786
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1863725 times.
1863725 if (res == NULL)
10787 goto error;
10788 1863725 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10789 1863725 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
10790 1863725 Py_DECREF(left);
10791 1863725 *p_left = res;
10792 }
10793 assert(_PyUnicode_CheckConsistency(*p_left, 1));
10794 2405380 return;
10795
10796 error:
10797 Py_CLEAR(*p_left);
10798 }
10799
10800 void
10801 19870 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10802 {
10803 19870 PyUnicode_Append(pleft, right);
10804 19870 Py_XDECREF(right);
10805 19870 }
10806
10807 /*
10808 Wraps asciilib_parse_args_finds() and additionally ensures that the
10809 first argument is a unicode object.
10810 */
10811
10812 static inline int
10813 2590908 parse_args_finds_unicode(const char * function_name, PyObject *args,
10814 PyObject **substring,
10815 Py_ssize_t *start, Py_ssize_t *end)
10816 {
10817
1/2
✓ Branch 1 taken 2590908 times.
✗ Branch 2 not taken.
2590908 if (asciilib_parse_args_finds(function_name, args, substring, start, end)) {
10818
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 2590908 times.
2590908 if (ensure_unicode(*substring) < 0)
10819 return 0;
10820 2590908 return 1;
10821 }
10822 return 0;
10823 }
10824
10825 PyDoc_STRVAR(count__doc__,
10826 "S.count(sub[, start[, end]]) -> int\n\
10827 \n\
10828 Return the number of non-overlapping occurrences of substring sub in\n\
10829 string S[start:end]. Optional arguments start and end are\n\
10830 interpreted as in slice notation.");
10831
10832 static PyObject *
10833 62281 unicode_count(PyObject *self, PyObject *args)
10834 {
10835 62281 PyObject *substring = NULL; /* initialize to fix a compiler warning */
10836 62281 Py_ssize_t start = 0;
10837 62281 Py_ssize_t end = PY_SSIZE_T_MAX;
10838 PyObject *result;
10839 int kind1, kind2;
10840 const void *buf1, *buf2;
10841 Py_ssize_t len1, len2, iresult;
10842
10843
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 62281 times.
62281 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
10844 return NULL;
10845
10846 62281 kind1 = PyUnicode_KIND(self);
10847 62281 kind2 = PyUnicode_KIND(substring);
10848
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 62281 times.
62281 if (kind1 < kind2)
10849 return PyLong_FromLong(0);
10850
10851 62281 len1 = PyUnicode_GET_LENGTH(self);
10852 62281 len2 = PyUnicode_GET_LENGTH(substring);
10853
4/10
✓ Branch 0 taken 3749 times.
✓ Branch 1 taken 58532 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 58532 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 62281 times.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
62281 ADJUST_INDICES(start, end, len1);
10854
2/2
✓ Branch 0 taken 106 times.
✓ Branch 1 taken 62175 times.
62281 if (end - start < len2)
10855 106 return PyLong_FromLong(0);
10856
10857 62175 buf1 = PyUnicode_DATA(self);
10858 62175 buf2 = PyUnicode_DATA(substring);
10859
2/2
✓ Branch 0 taken 138 times.
✓ Branch 1 taken 62037 times.
62175 if (kind2 != kind1) {
10860 138 buf2 = unicode_askind(kind2, buf2, len2, kind1);
10861
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 138 times.
138 if (!buf2)
10862 return NULL;
10863 }
10864
2/4
✓ Branch 0 taken 62037 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 138 times.
✗ Branch 3 not taken.
62175 switch (kind1) {
10865 62037 case PyUnicode_1BYTE_KIND:
10866 62037 iresult = ucs1lib_count(
10867 ((const Py_UCS1*)buf1) + start, end - start,
10868 buf2, len2, PY_SSIZE_T_MAX
10869 );
10870 62037 break;
10871 case PyUnicode_2BYTE_KIND:
10872 iresult = ucs2lib_count(
10873 ((const Py_UCS2*)buf1) + start, end - start,
10874 buf2, len2, PY_SSIZE_T_MAX
10875 );
10876 break;
10877 138 case PyUnicode_4BYTE_KIND:
10878 138 iresult = ucs4lib_count(
10879 138 ((const Py_UCS4*)buf1) + start, end - start,
10880 buf2, len2, PY_SSIZE_T_MAX
10881 );
10882 138 break;
10883 default:
10884 Py_UNREACHABLE();
10885 }
10886
10887 62175 result = PyLong_FromSsize_t(iresult);
10888
10889 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
10890
2/2
✓ Branch 0 taken 138 times.
✓ Branch 1 taken 62037 times.
62175 if (kind2 != kind1)
10891 138 PyMem_Free((void *)buf2);
10892
10893 62175 return result;
10894 }
10895
10896 /*[clinic input]
10897 str.encode as unicode_encode
10898
10899 encoding: str(c_default="NULL") = 'utf-8'
10900 The encoding in which to encode the string.
10901 errors: str(c_default="NULL") = 'strict'
10902 The error handling scheme to use for encoding errors.
10903 The default is 'strict' meaning that encoding errors raise a
10904 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
10905 'xmlcharrefreplace' as well as any other name registered with
10906 codecs.register_error that can handle UnicodeEncodeErrors.
10907
10908 Encode the string using the codec registered for encoding.
10909 [clinic start generated code]*/
10910
10911 static PyObject *
10912 2003701 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
10913 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
10914 {
10915 2003701 return PyUnicode_AsEncodedString(self, encoding, errors);
10916 }
10917
10918 /*[clinic input]
10919 str.expandtabs as unicode_expandtabs
10920
10921 tabsize: int = 8
10922
10923 Return a copy where all tab characters are expanded using spaces.
10924
10925 If tabsize is not given, a tab size of 8 characters is assumed.
10926 [clinic start generated code]*/
10927
10928 static PyObject *
10929 71991 unicode_expandtabs_impl(PyObject *self, int tabsize)
10930 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
10931 {
10932 Py_ssize_t i, j, line_pos, src_len, incr;
10933 Py_UCS4 ch;
10934 PyObject *u;
10935 const void *src_data;
10936 void *dest_data;
10937 int kind;
10938 int found;
10939
10940 /* First pass: determine size of output string */
10941 71991 src_len = PyUnicode_GET_LENGTH(self);
10942 71991 i = j = line_pos = 0;
10943 71991 kind = PyUnicode_KIND(self);
10944 71991 src_data = PyUnicode_DATA(self);
10945 71991 found = 0;
10946
2/2
✓ Branch 0 taken 2344751 times.
✓ Branch 1 taken 71991 times.
2416742 for (; i < src_len; i++) {
10947 2344751 ch = PyUnicode_READ(kind, src_data, i);
10948
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2344751 times.
2344751 if (ch == '\t') {
10949 found = 1;
10950 if (tabsize > 0) {
10951 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10952 if (j > PY_SSIZE_T_MAX - incr)
10953 goto overflow;
10954 line_pos += incr;
10955 j += incr;
10956 }
10957 }
10958 else {
10959
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2344751 times.
2344751 if (j > PY_SSIZE_T_MAX - 1)
10960 goto overflow;
10961 2344751 line_pos++;
10962 2344751 j++;
10963
2/4
✓ Branch 0 taken 2344751 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2344751 times.
2344751 if (ch == '\n' || ch == '\r')
10964 line_pos = 0;
10965 }
10966 }
10967
1/2
✓ Branch 0 taken 71991 times.
✗ Branch 1 not taken.
71991 if (!found)
10968 71991 return unicode_result_unchanged(self);
10969
10970 /* Second pass: create output string and fill it */
10971 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10972 if (!u)
10973 return NULL;
10974 dest_data = PyUnicode_DATA(u);
10975
10976 i = j = line_pos = 0;
10977
10978 for (; i < src_len; i++) {
10979 ch = PyUnicode_READ(kind, src_data, i);
10980 if (ch == '\t') {
10981 if (tabsize > 0) {
10982 incr = tabsize - (line_pos % tabsize);
10983 line_pos += incr;
10984 unicode_fill(kind, dest_data, ' ', j, incr);
10985 j += incr;
10986 }
10987 }
10988 else {
10989 line_pos++;
10990 PyUnicode_WRITE(kind, dest_data, j, ch);
10991 j++;
10992 if (ch == '\n' || ch == '\r')
10993 line_pos = 0;
10994 }
10995 }
10996 assert (j == PyUnicode_GET_LENGTH(u));
10997 return unicode_result(u);
10998
10999 overflow:
11000 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11001 return NULL;
11002 }
11003
11004 PyDoc_STRVAR(find__doc__,
11005 "S.find(sub[, start[, end]]) -> int\n\
11006 \n\
11007 Return the lowest index in S where substring sub is found,\n\
11008 such that sub is contained within S[start:end]. Optional\n\
11009 arguments start and end are interpreted as in slice notation.\n\
11010 \n\
11011 Return -1 on failure.");
11012
11013 static PyObject *
11014 813091 unicode_find(PyObject *self, PyObject *args)
11015 {
11016 /* initialize variables to prevent gcc warning */
11017 813091 PyObject *substring = NULL;
11018 813091 Py_ssize_t start = 0;
11019 813091 Py_ssize_t end = 0;
11020 Py_ssize_t result;
11021
11022
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 813091 times.
813091 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11023 return NULL;
11024
11025 813091 result = any_find_slice(self, substring, start, end, 1);
11026
11027
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 813091 times.
813091 if (result == -2)
11028 return NULL;
11029
11030 813091 return PyLong_FromSsize_t(result);
11031 }
11032
11033 static PyObject *
11034 55122851 unicode_getitem(PyObject *self, Py_ssize_t index)
11035 {
11036 const void *data;
11037 int kind;
11038 Py_UCS4 ch;
11039
11040
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 55122851 times.
55122851 if (!PyUnicode_Check(self)) {
11041 PyErr_BadArgument();
11042 return NULL;
11043 }
11044
3/4
✓ Branch 0 taken 55122851 times.
✗ Branch 1 not taken.
✓ Branch 3 taken 558181 times.
✓ Branch 4 taken 54564670 times.
55122851 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11045 558181 PyErr_SetString(PyExc_IndexError, "string index out of range");
11046 558181 return NULL;
11047 }
11048 54564670 kind = PyUnicode_KIND(self);
11049 54564670 data = PyUnicode_DATA(self);
11050 54564670 ch = PyUnicode_READ(kind, data, index);
11051 54564670 return unicode_char(ch);
11052 }
11053
11054 /* Believe it or not, this produces the same value for ASCII strings
11055 as bytes_hash(). */
11056 static Py_hash_t
11057 498036273 unicode_hash(PyObject *self)
11058 {
11059 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
11060
11061 #ifdef Py_DEBUG
11062 assert(_Py_HashSecret_Initialized);
11063 #endif
11064
2/2
✓ Branch 0 taken 363178854 times.
✓ Branch 1 taken 134857419 times.
498036273 if (_PyUnicode_HASH(self) != -1)
11065 363178854 return _PyUnicode_HASH(self);
11066
11067 134857419 x = _Py_HashBytes(PyUnicode_DATA(self),
11068 134857419 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11069 134857419 _PyUnicode_HASH(self) = x;
11070 134857419 return x;
11071 }
11072
11073 PyDoc_STRVAR(index__doc__,
11074 "S.index(sub[, start[, end]]) -> int\n\
11075 \n\
11076 Return the lowest index in S where substring sub is found,\n\
11077 such that sub is contained within S[start:end]. Optional\n\
11078 arguments start and end are interpreted as in slice notation.\n\
11079 \n\
11080 Raises ValueError when the substring is not found.");
11081
11082 static PyObject *
11083 8035 unicode_index(PyObject *self, PyObject *args)
11084 {
11085 /* initialize variables to prevent gcc warning */
11086 Py_ssize_t result;
11087 8035 PyObject *substring = NULL;
11088 8035 Py_ssize_t start = 0;
11089 8035 Py_ssize_t end = 0;
11090
11091
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 8035 times.
8035 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11092 return NULL;
11093
11094 8035 result = any_find_slice(self, substring, start, end, 1);
11095
11096
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8035 times.
8035 if (result == -2)
11097 return NULL;
11098
11099
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8027 times.
8035 if (result < 0) {
11100 8 PyErr_SetString(PyExc_ValueError, "substring not found");
11101 8 return NULL;
11102 }
11103
11104 8027 return PyLong_FromSsize_t(result);
11105 }
11106
11107 /*[clinic input]
11108 str.isascii as unicode_isascii
11109
11110 Return True if all characters in the string are ASCII, False otherwise.
11111
11112 ASCII characters have code points in the range U+0000-U+007F.
11113 Empty string is ASCII too.
11114 [clinic start generated code]*/
11115
11116 static PyObject *
11117 106759 unicode_isascii_impl(PyObject *self)
11118 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11119 {
11120 106759 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11121 }
11122
11123 /*[clinic input]
11124 str.islower as unicode_islower
11125
11126 Return True if the string is a lowercase string, False otherwise.
11127
11128 A string is lowercase if all cased characters in the string are lowercase and
11129 there is at least one cased character in the string.
11130 [clinic start generated code]*/
11131
11132 static PyObject *
11133 95190 unicode_islower_impl(PyObject *self)
11134 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11135 {
11136 Py_ssize_t i, length;
11137 int kind;
11138 const void *data;
11139 int cased;
11140
11141 95190 length = PyUnicode_GET_LENGTH(self);
11142 95190 kind = PyUnicode_KIND(self);
11143 95190 data = PyUnicode_DATA(self);
11144
11145 /* Shortcut for single character strings */
11146
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 95190 times.
95190 if (length == 1)
11147 return PyBool_FromLong(
11148 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11149
11150 /* Special case for empty strings */
11151
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 95190 times.
95190 if (length == 0)
11152 Py_RETURN_FALSE;
11153
11154 95190 cased = 0;
11155
2/2
✓ Branch 0 taken 610490 times.
✓ Branch 1 taken 81804 times.
692294 for (i = 0; i < length; i++) {
11156 610490 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11157
11158
3/4
✓ Branch 1 taken 597104 times.
✓ Branch 2 taken 13386 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 597104 times.
610490 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11159 13386 Py_RETURN_FALSE;
11160
3/4
✓ Branch 0 taken 84978 times.
✓ Branch 1 taken 512126 times.
✓ Branch 3 taken 84978 times.
✗ Branch 4 not taken.
597104 else if (!cased && Py_UNICODE_ISLOWER(ch))
11161 84978 cased = 1;
11162 }
11163 81804 return PyBool_FromLong(cased);
11164 }
11165
11166 /*[clinic input]
11167 str.isupper as unicode_isupper
11168
11169 Return True if the string is an uppercase string, False otherwise.
11170
11171 A string is uppercase if all cased characters in the string are uppercase and
11172 there is at least one cased character in the string.
11173 [clinic start generated code]*/
11174
11175 static PyObject *
11176 3172563 unicode_isupper_impl(PyObject *self)
11177 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11178 {
11179 Py_ssize_t i, length;
11180 int kind;
11181 const void *data;
11182 int cased;
11183
11184 3172563 length = PyUnicode_GET_LENGTH(self);
11185 3172563 kind = PyUnicode_KIND(self);
11186 3172563 data = PyUnicode_DATA(self);
11187
11188 /* Shortcut for single character strings */
11189
2/2
✓ Branch 0 taken 17941 times.
✓ Branch 1 taken 3154622 times.
3172563 if (length == 1)
11190 17941 return PyBool_FromLong(
11191 17941 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11192
11193 /* Special case for empty strings */
11194
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3154622 times.
3154622 if (length == 0)
11195 Py_RETURN_FALSE;
11196
11197 3154622 cased = 0;
11198
2/2
✓ Branch 0 taken 34956757 times.
✓ Branch 1 taken 2644996 times.
37601753 for (i = 0; i < length; i++) {
11199 34956757 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11200
11201
3/4
✓ Branch 1 taken 34447131 times.
✓ Branch 2 taken 509626 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 34447131 times.
34956757 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11202 509626 Py_RETURN_FALSE;
11203
4/4
✓ Branch 0 taken 2873007 times.
✓ Branch 1 taken 31574124 times.
✓ Branch 3 taken 2689752 times.
✓ Branch 4 taken 183255 times.
34447131 else if (!cased && Py_UNICODE_ISUPPER(ch))
11204 2689752 cased = 1;
11205 }
11206 2644996 return PyBool_FromLong(cased);
11207 }
11208
11209 /*[clinic input]
11210 str.istitle as unicode_istitle
11211
11212 Return True if the string is a title-cased string, False otherwise.
11213
11214 In a title-cased string, upper- and title-case characters may only
11215 follow uncased characters and lowercase characters only cased ones.
11216 [clinic start generated code]*/
11217
11218 static PyObject *
11219 unicode_istitle_impl(PyObject *self)
11220 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11221 {
11222 Py_ssize_t i, length;
11223 int kind;
11224 const void *data;
11225 int cased, previous_is_cased;
11226
11227 length = PyUnicode_GET_LENGTH(self);
11228 kind = PyUnicode_KIND(self);
11229 data = PyUnicode_DATA(self);
11230
11231 /* Shortcut for single character strings */
11232 if (length == 1) {
11233 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11234 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11235 (Py_UNICODE_ISUPPER(ch) != 0));
11236 }
11237
11238 /* Special case for empty strings */
11239 if (length == 0)
11240 Py_RETURN_FALSE;
11241
11242 cased = 0;
11243 previous_is_cased = 0;
11244 for (i = 0; i < length; i++) {
11245 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11246
11247 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11248 if (previous_is_cased)
11249 Py_RETURN_FALSE;
11250 previous_is_cased = 1;
11251 cased = 1;
11252 }
11253 else if (Py_UNICODE_ISLOWER(ch)) {
11254 if (!previous_is_cased)
11255 Py_RETURN_FALSE;
11256 previous_is_cased = 1;
11257 cased = 1;
11258 }
11259 else
11260 previous_is_cased = 0;
11261 }
11262 return PyBool_FromLong(cased);
11263 }
11264
11265 /*[clinic input]
11266 str.isspace as unicode_isspace
11267
11268 Return True if the string is a whitespace string, False otherwise.
11269
11270 A string is whitespace if all characters in the string are whitespace and there
11271 is at least one character in the string.
11272 [clinic start generated code]*/
11273
11274 static PyObject *
11275 113704 unicode_isspace_impl(PyObject *self)
11276 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
11277 {
11278 Py_ssize_t i, length;
11279 int kind;
11280 const void *data;
11281
11282 113704 length = PyUnicode_GET_LENGTH(self);
11283 113704 kind = PyUnicode_KIND(self);
11284 113704 data = PyUnicode_DATA(self);
11285
11286 /* Shortcut for single character strings */
11287
1/2
✓ Branch 0 taken 113704 times.
✗ Branch 1 not taken.
113704 if (length == 1)
11288 113704 return PyBool_FromLong(
11289 113704 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11290
11291 /* Special case for empty strings */
11292 if (length == 0)
11293 Py_RETURN_FALSE;
11294
11295 for (i = 0; i < length; i++) {
11296 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11297 if (!Py_UNICODE_ISSPACE(ch))
11298 Py_RETURN_FALSE;
11299 }
11300 Py_RETURN_TRUE;
11301 }
11302
11303 /*[clinic input]
11304 str.isalpha as unicode_isalpha
11305
11306 Return True if the string is an alphabetic string, False otherwise.
11307
11308 A string is alphabetic if all characters in the string are alphabetic and there
11309 is at least one character in the string.
11310 [clinic start generated code]*/
11311
11312 static PyObject *
11313 523527 unicode_isalpha_impl(PyObject *self)
11314 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
11315 {
11316 Py_ssize_t i, length;
11317 int kind;
11318 const void *data;
11319
11320 523527 length = PyUnicode_GET_LENGTH(self);
11321 523527 kind = PyUnicode_KIND(self);
11322 523527 data = PyUnicode_DATA(self);
11323
11324 /* Shortcut for single character strings */
11325
1/2
✓ Branch 0 taken 523527 times.
✗ Branch 1 not taken.
523527 if (length == 1)
11326 523527 return PyBool_FromLong(
11327 523527 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11328
11329 /* Special case for empty strings */
11330 if (length == 0)
11331 Py_RETURN_FALSE;
11332
11333 for (i = 0; i < length; i++) {
11334 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11335 Py_RETURN_FALSE;
11336 }
11337 Py_RETURN_TRUE;
11338 }
11339
11340 /*[clinic input]
11341 str.isalnum as unicode_isalnum
11342
11343 Return True if the string is an alpha-numeric string, False otherwise.
11344
11345 A string is alpha-numeric if all characters in the string are alpha-numeric and
11346 there is at least one character in the string.
11347 [clinic start generated code]*/
11348
11349 static PyObject *
11350 53572 unicode_isalnum_impl(PyObject *self)
11351 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
11352 {
11353 int kind;
11354 const void *data;
11355 Py_ssize_t len, i;
11356
11357 53572 kind = PyUnicode_KIND(self);
11358 53572 data = PyUnicode_DATA(self);
11359 53572 len = PyUnicode_GET_LENGTH(self);
11360
11361 /* Shortcut for single character strings */
11362
1/2
✓ Branch 0 taken 53572 times.
✗ Branch 1 not taken.
53572 if (len == 1) {
11363 53572 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11364 53572 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11365 }
11366
11367 /* Special case for empty strings */
11368 if (len == 0)
11369 Py_RETURN_FALSE;
11370
11371 for (i = 0; i < len; i++) {
11372 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11373 if (!Py_UNICODE_ISALNUM(ch))
11374 Py_RETURN_FALSE;
11375 }
11376 Py_RETURN_TRUE;
11377 }
11378
11379 /*[clinic input]
11380 str.isdecimal as unicode_isdecimal
11381
11382 Return True if the string is a decimal string, False otherwise.
11383
11384 A string is a decimal string if all characters in the string are decimal and
11385 there is at least one character in the string.
11386 [clinic start generated code]*/
11387
11388 static PyObject *
11389 468 unicode_isdecimal_impl(PyObject *self)
11390 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
11391 {
11392 Py_ssize_t i, length;
11393 int kind;
11394 const void *data;
11395
11396 468 length = PyUnicode_GET_LENGTH(self);
11397 468 kind = PyUnicode_KIND(self);
11398 468 data = PyUnicode_DATA(self);
11399
11400 /* Shortcut for single character strings */
11401
1/2
✓ Branch 0 taken 468 times.
✗ Branch 1 not taken.
468 if (length == 1)
11402 468 return PyBool_FromLong(
11403 468 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11404
11405 /* Special case for empty strings */
11406 if (length == 0)
11407 Py_RETURN_FALSE;
11408
11409 for (i = 0; i < length; i++) {
11410 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11411 Py_RETURN_FALSE;
11412 }
11413 Py_RETURN_TRUE;
11414 }
11415
11416 /*[clinic input]
11417 str.isdigit as unicode_isdigit
11418
11419 Return True if the string is a digit string, False otherwise.
11420
11421 A string is a digit string if all characters in the string are digits and there
11422 is at least one character in the string.
11423 [clinic start generated code]*/
11424
11425 static PyObject *
11426 469008 unicode_isdigit_impl(PyObject *self)
11427 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
11428 {
11429 Py_ssize_t i, length;
11430 int kind;
11431 const void *data;
11432
11433 469008 length = PyUnicode_GET_LENGTH(self);
11434 469008 kind = PyUnicode_KIND(self);
11435 469008 data = PyUnicode_DATA(self);
11436
11437 /* Shortcut for single character strings */
11438
2/2
✓ Branch 0 taken 454418 times.
✓ Branch 1 taken 14590 times.
469008 if (length == 1) {
11439 454418 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11440 454418 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11441 }
11442
11443 /* Special case for empty strings */
11444
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 14590 times.
14590 if (length == 0)
11445 Py_RETURN_FALSE;
11446
11447
2/2
✓ Branch 0 taken 36075 times.
✓ Branch 1 taken 14590 times.
50665 for (i = 0; i < length; i++) {
11448
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 36075 times.
36075 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11449 Py_RETURN_FALSE;
11450 }
11451 14590 Py_RETURN_TRUE;
11452 }
11453
11454 /*[clinic input]
11455 str.isnumeric as unicode_isnumeric
11456
11457 Return True if the string is a numeric string, False otherwise.
11458
11459 A string is numeric if all characters in the string are numeric and there is at
11460 least one character in the string.
11461 [clinic start generated code]*/
11462
11463 static PyObject *
11464 unicode_isnumeric_impl(PyObject *self)
11465 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
11466 {
11467 Py_ssize_t i, length;
11468 int kind;
11469 const void *data;
11470
11471 length = PyUnicode_GET_LENGTH(self);
11472 kind = PyUnicode_KIND(self);
11473 data = PyUnicode_DATA(self);
11474
11475 /* Shortcut for single character strings */
11476 if (length == 1)
11477 return PyBool_FromLong(
11478 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11479
11480 /* Special case for empty strings */
11481 if (length == 0)
11482 Py_RETURN_FALSE;
11483
11484 for (i = 0; i < length; i++) {
11485 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11486 Py_RETURN_FALSE;
11487 }
11488 Py_RETURN_TRUE;
11489 }
11490
11491 Py_ssize_t
11492 1781013 _PyUnicode_ScanIdentifier(PyObject *self)
11493 {
11494 Py_ssize_t i;
11495 1781013 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
11496
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1781013 times.
1781013 if (len == 0) {
11497 /* an empty string is not a valid identifier */
11498 return 0;
11499 }
11500
11501 1781013 int kind = PyUnicode_KIND(self);
11502 1781013 const void *data = PyUnicode_DATA(self);
11503 1781013 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11504 /* PEP 3131 says that the first character must be in
11505 XID_Start and subsequent characters in XID_Continue,
11506 and for the ASCII range, the 2.x rules apply (i.e
11507 start with letters and underscore, continue with
11508 letters, digits, underscore). However, given the current
11509 definition of XID_Start and XID_Continue, it is sufficient
11510 to check just for these, except that _ must be allowed
11511 as starting an identifier. */
11512
4/4
✓ Branch 1 taken 647184 times.
✓ Branch 2 taken 1133829 times.
✓ Branch 3 taken 420610 times.
✓ Branch 4 taken 226574 times.
1781013 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
11513 420610 return 0;
11514 }
11515
11516
2/2
✓ Branch 0 taken 5693770 times.
✓ Branch 1 taken 1342549 times.
7036319 for (i = 1; i < len; i++) {
11517 5693770 ch = PyUnicode_READ(kind, data, i);
11518
2/2
✓ Branch 1 taken 17854 times.
✓ Branch 2 taken 5675916 times.
5693770 if (!_PyUnicode_IsXidContinue(ch)) {
11519 17854 return i;
11520 }
11521 }
11522 1342549 return i;
11523 }
11524
11525 int
11526 1780953 PyUnicode_IsIdentifier(PyObject *self)
11527 {
11528 1780953 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
11529 1780953 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
11530 /* an empty string is not a valid identifier */
11531
3/4
✓ Branch 0 taken 1780953 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1342489 times.
✓ Branch 3 taken 438464 times.
1780953 return len && i == len;
11532 }
11533
11534 /*[clinic input]
11535 str.isidentifier as unicode_isidentifier
11536
11537 Return True if the string is a valid Python identifier, False otherwise.
11538
11539 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
11540 such as "def" or "class".
11541 [clinic start generated code]*/
11542
11543 static PyObject *
11544 1529669 unicode_isidentifier_impl(PyObject *self)
11545 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
11546 {
11547 1529669 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11548 }
11549
11550 /*[clinic input]
11551 str.isprintable as unicode_isprintable
11552
11553 Return True if the string is printable, False otherwise.
11554
11555 A string is printable if all of its characters are considered printable in
11556 repr() or if it is empty.
11557 [clinic start generated code]*/
11558
11559 static PyObject *
11560 unicode_isprintable_impl(PyObject *self)
11561 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
11562 {
11563 Py_ssize_t i, length;
11564 int kind;
11565 const void *data;
11566
11567 length = PyUnicode_GET_LENGTH(self);
11568 kind = PyUnicode_KIND(self);
11569 data = PyUnicode_DATA(self);
11570
11571 /* Shortcut for single character strings */
11572 if (length == 1)
11573 return PyBool_FromLong(
11574 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11575
11576 for (i = 0; i < length; i++) {
11577 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11578 Py_RETURN_FALSE;
11579 }
11580 }
11581 Py_RETURN_TRUE;
11582 }
11583
11584 /*[clinic input]
11585 str.join as unicode_join
11586
11587 iterable: object
11588 /
11589
11590 Concatenate any number of strings.
11591
11592 The string whose method is called is inserted in between each given string.
11593 The result is returned as a new string.
11594
11595 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
11596 [clinic start generated code]*/
11597
11598 static PyObject *
11599 7413707 unicode_join(PyObject *self, PyObject *iterable)
11600 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
11601 {
11602 7413707 return PyUnicode_Join(self, iterable);
11603 }
11604
11605 static Py_ssize_t
11606 64853817 unicode_length(PyObject *self)
11607 {
11608 64853817 return PyUnicode_GET_LENGTH(self);
11609 }
11610
11611 /*[clinic input]
11612 str.ljust as unicode_ljust
11613
11614 width: Py_ssize_t
11615 fillchar: Py_UCS4 = ' '
11616 /
11617
11618 Return a left-justified string of length width.
11619
11620 Padding is done using the specified fill character (default is a space).
11621 [clinic start generated code]*/
11622
11623 static PyObject *
11624 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11625 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
11626 {
11627 if (PyUnicode_GET_LENGTH(self) >= width)
11628 return unicode_result_unchanged(self);
11629
11630 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11631 }
11632
11633 /*[clinic input]
11634 str.lower as unicode_lower
11635
11636 Return a copy of the string converted to lowercase.
11637 [clinic start generated code]*/
11638
11639 static PyObject *
11640 2472891 unicode_lower_impl(PyObject *self)
11641 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
11642 {
11643
1/2
✓ Branch 1 taken 2472891 times.
✗ Branch 2 not taken.
2472891 if (PyUnicode_IS_ASCII(self))
11644 2472891 return ascii_upper_or_lower(self, 1);
11645 return case_operation(self, do_lower);
11646 }
11647
11648 #define LEFTSTRIP 0
11649 #define RIGHTSTRIP 1
11650 #define BOTHSTRIP 2
11651
11652 /* Arrays indexed by above */
11653 static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};
11654
11655 #define STRIPNAME(i) (stripfuncnames[i])
11656
11657 /* externally visible for str.strip(unicode) */
11658 PyObject *
11659 9354035 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11660 {
11661 const void *data;
11662 int kind;
11663 Py_ssize_t i, j, len;
11664 BLOOM_MASK sepmask;
11665 Py_ssize_t seplen;
11666
11667 9354035 kind = PyUnicode_KIND(self);
11668 9354035 data = PyUnicode_DATA(self);
11669 9354035 len = PyUnicode_GET_LENGTH(self);
11670 9354035 seplen = PyUnicode_GET_LENGTH(sepobj);
11671 9354035 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11672 9354035 PyUnicode_DATA(sepobj),
11673 seplen);
11674
11675 9354035 i = 0;
11676
2/2
✓ Branch 0 taken 627467 times.
✓ Branch 1 taken 8726568 times.
9354035 if (striptype != RIGHTSTRIP) {
11677
2/2
✓ Branch 0 taken 1134186 times.
✓ Branch 1 taken 24 times.
1134210 while (i < len) {
11678 1134186 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11679
2/2
✓ Branch 0 taken 611868 times.
✓ Branch 1 taken 522318 times.
1134186 if (!BLOOM(sepmask, ch))
11680 611868 break;
11681
2/2
✓ Branch 1 taken 15575 times.
✓ Branch 2 taken 506743 times.
522318 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11682 15575 break;
11683 506743 i++;
11684 }
11685 }
11686
11687 9354035 j = len;
11688
2/2
✓ Branch 0 taken 8811902 times.
✓ Branch 1 taken 542133 times.
9354035 if (striptype != LEFTSTRIP) {
11689 8811902 j--;
11690
2/2
✓ Branch 0 taken 9941994 times.
✓ Branch 1 taken 24 times.
9942018 while (j >= i) {
11691 9941994 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11692
2/2
✓ Branch 0 taken 6856118 times.
✓ Branch 1 taken 3085876 times.
9941994 if (!BLOOM(sepmask, ch))
11693 6856118 break;
11694
2/2
✓ Branch 1 taken 1955760 times.
✓ Branch 2 taken 1130116 times.
3085876 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11695 1955760 break;
11696 1130116 j--;
11697 }
11698
11699 8811902 j++;
11700 }
11701
11702 9354035 return PyUnicode_Substring(self, i, j);
11703 }
11704
11705 PyObject*
11706 38862540 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11707 {
11708 const unsigned char *data;
11709 int kind;
11710 Py_ssize_t length;
11711
11712 38862540 length = PyUnicode_GET_LENGTH(self);
11713 38862540 end = Py_MIN(end, length);
11714
11715
4/4
✓ Branch 0 taken 23025152 times.
✓ Branch 1 taken 15837388 times.
✓ Branch 2 taken 13707167 times.
✓ Branch 3 taken 9317985 times.
38862540 if (start == 0 && end == length)
11716 13707167 return unicode_result_unchanged(self);
11717
11718
2/4
✓ Branch 0 taken 25155373 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 25155373 times.
25155373 if (start < 0 || end < 0) {
11719 PyErr_SetString(PyExc_IndexError, "string index out of range");
11720 return NULL;
11721 }
11722
3/4
✓ Branch 0 taken 25123980 times.
✓ Branch 1 taken 31393 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 25123980 times.
25155373 if (start >= length || end < start)
11723 31393 _Py_RETURN_UNICODE_EMPTY();
11724
11725 25123980 length = end - start;
11726
2/2
✓ Branch 1 taken 25019877 times.
✓ Branch 2 taken 104103 times.
25123980 if (PyUnicode_IS_ASCII(self)) {
11727 25019877 data = PyUnicode_1BYTE_DATA(self);
11728 25019877 return _PyUnicode_FromASCII((const char*)(data + start), length);
11729 }
11730 else {
11731 104103 kind = PyUnicode_KIND(self);
11732 104103 data = PyUnicode_1BYTE_DATA(self);
11733 104103 return PyUnicode_FromKindAndData(kind,
11734 104103 data + kind * start,
11735 length);
11736 }
11737 }
11738
11739 static PyObject *
11740 2324715 do_strip(PyObject *self, int striptype)
11741 {
11742 Py_ssize_t len, i, j;
11743
11744 2324715 len = PyUnicode_GET_LENGTH(self);
11745
11746
1/2
✓ Branch 1 taken 2324715 times.
✗ Branch 2 not taken.
2324715 if (PyUnicode_IS_ASCII(self)) {
11747 2324715 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
11748
11749 2324715 i = 0;
11750
2/2
✓ Branch 0 taken 908865 times.
✓ Branch 1 taken 1415850 times.
2324715 if (striptype != RIGHTSTRIP) {
11751
2/2
✓ Branch 0 taken 1462627 times.
✓ Branch 1 taken 103127 times.
1565754 while (i < len) {
11752 1462627 Py_UCS1 ch = data[i];
11753
2/2
✓ Branch 0 taken 805738 times.
✓ Branch 1 taken 656889 times.
1462627 if (!_Py_ascii_whitespace[ch])
11754 805738 break;
11755 656889 i++;
11756 }
11757 }
11758
11759 2324715 j = len;
11760
2/2
✓ Branch 0 taken 2323608 times.
✓ Branch 1 taken 1107 times.
2324715 if (striptype != LEFTSTRIP) {
11761 2323608 j--;
11762
2/2
✓ Branch 0 taken 3037458 times.
✓ Branch 1 taken 156087 times.
3193545 while (j >= i) {
11763 3037458 Py_UCS1 ch = data[j];
11764
2/2
✓ Branch 0 taken 2167521 times.
✓ Branch 1 taken 869937 times.
3037458 if (!_Py_ascii_whitespace[ch])
11765 2167521 break;
11766 869937 j--;
11767 }
11768 2323608 j++;
11769 }
11770 }
11771 else {
11772 int kind = PyUnicode_KIND(self);
11773 const void *data = PyUnicode_DATA(self);
11774
11775 i = 0;
11776 if (striptype != RIGHTSTRIP) {
11777 while (i < len) {
11778 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11779 if (!Py_UNICODE_ISSPACE(ch))
11780 break;
11781 i++;
11782 }
11783 }
11784
11785 j = len;
11786 if (striptype != LEFTSTRIP) {
11787 j--;
11788 while (j >= i) {
11789 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11790 if (!Py_UNICODE_ISSPACE(ch))
11791 break;
11792 j--;
11793 }
11794 j++;
11795 }
11796 }
11797
11798 2324715 return PyUnicode_Substring(self, i, j);
11799 }
11800
11801
11802 static PyObject *
11803 11678750 do_argstrip(PyObject *self, int striptype, PyObject *sep)
11804 {
11805
2/2
✓ Branch 0 taken 9354035 times.
✓ Branch 1 taken 2324715 times.
11678750 if (sep != Py_None) {
11806
1/2
✓ Branch 2 taken 9354035 times.
✗ Branch 3 not taken.
9354035 if (PyUnicode_Check(sep))
11807 9354035 return _PyUnicode_XStrip(self, striptype, sep);
11808 else {
11809 PyErr_Format(PyExc_TypeError,
11810 "%s arg must be None or str",
11811 STRIPNAME(striptype));
11812 return NULL;
11813 }
11814 }
11815
11816 2324715 return do_strip(self, striptype);
11817 }
11818
11819
11820 /*[clinic input]
11821 str.strip as unicode_strip
11822
11823 chars: object = None
11824 /
11825
11826 Return a copy of the string with leading and trailing whitespace removed.
11827
11828 If chars is given and not None, remove characters in chars instead.
11829 [clinic start generated code]*/
11830
11831 static PyObject *
11832 993092 unicode_strip_impl(PyObject *self, PyObject *chars)
11833 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
11834 {
11835 993092 return do_argstrip(self, BOTHSTRIP, chars);
11836 }
11837
11838
11839 /*[clinic input]
11840 str.lstrip as unicode_lstrip
11841
11842 chars: object = None
11843 /
11844
11845 Return a copy of the string with leading whitespace removed.
11846
11847 If chars is given and not None, remove characters in chars instead.
11848 [clinic start generated code]*/
11849
11850 static PyObject *
11851 543240 unicode_lstrip_impl(PyObject *self, PyObject *chars)
11852 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
11853 {
11854 543240 return do_argstrip(self, LEFTSTRIP, chars);
11855 }
11856
11857
11858 /*[clinic input]
11859 str.rstrip as unicode_rstrip
11860
11861 chars: object = None
11862 /
11863
11864 Return a copy of the string with trailing whitespace removed.
11865
11866 If chars is given and not None, remove characters in chars instead.
11867 [clinic start generated code]*/
11868
11869 static PyObject *
11870 10142418 unicode_rstrip_impl(PyObject *self, PyObject *chars)
11871 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
11872 {
11873 10142418 return do_argstrip(self, RIGHTSTRIP, chars);
11874 }
11875
11876
11877 static PyObject*
11878 409988 unicode_repeat(PyObject *str, Py_ssize_t len)
11879 {
11880 PyObject *u;
11881 Py_ssize_t nchars, n;
11882
11883
2/2
✓ Branch 0 taken 24537 times.
✓ Branch 1 taken 385451 times.
409988 if (len < 1)
11884 24537 _Py_RETURN_UNICODE_EMPTY();
11885
11886 /* no repeat, return original string */
11887
2/2
✓ Branch 0 taken 80742 times.
✓ Branch 1 taken 304709 times.
385451 if (len == 1)
11888 80742 return unicode_result_unchanged(str);
11889
11890
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 304709 times.
304709 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11891 PyErr_SetString(PyExc_OverflowError,
11892 "repeated string is too long");
11893 return NULL;
11894 }
11895 304709 nchars = len * PyUnicode_GET_LENGTH(str);
11896
11897 304709 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11898
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 304709 times.
304709 if (!u)
11899 return NULL;
11900 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11901
11902
2/2
✓ Branch 1 taken 218085 times.
✓ Branch 2 taken 86624 times.
304709 if (PyUnicode_GET_LENGTH(str) == 1) {
11903 218085 int kind = PyUnicode_KIND(str);
11904 218085 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11905
1/2
✓ Branch 0 taken 218085 times.
✗ Branch 1 not taken.
218085 if (kind == PyUnicode_1BYTE_KIND) {
11906 218085 void *to = PyUnicode_DATA(u);
11907 218085 memset(to, (unsigned char)fill_char, len);
11908 }
11909 else if (kind == PyUnicode_2BYTE_KIND) {
11910 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
11911 for (n = 0; n < len; ++n)
11912 ucs2[n] = fill_char;
11913 } else {
11914 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11915 assert(kind == PyUnicode_4BYTE_KIND);
11916 for (n = 0; n < len; ++n)
11917 ucs4[n] = fill_char;
11918 }
11919 }
11920 else {
11921 86624 Py_ssize_t char_size = PyUnicode_KIND(str);
11922 86624 char *to = (char *) PyUnicode_DATA(u);
11923 86624 _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
11924 86624 PyUnicode_GET_LENGTH(str) * char_size);
11925 }
11926
11927 assert(_PyUnicode_CheckConsistency(u, 1));
11928 304709 return u;
11929 }
11930
11931 PyObject *
11932 32 PyUnicode_Replace(PyObject *str,
11933 PyObject *substr,
11934 PyObject *replstr,
11935 Py_ssize_t maxcount)
11936 {
11937
3/6
✓ Branch 1 taken 32 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 32 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 32 times.
64 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
11938 32 ensure_unicode(replstr) < 0)
11939 return NULL;
11940 32 return replace(str, substr, replstr, maxcount);
11941 }
11942
11943 /*[clinic input]
11944 str.replace as unicode_replace
11945
11946 old: unicode
11947 new: unicode
11948 count: Py_ssize_t = -1
11949 Maximum number of occurrences to replace.
11950 -1 (the default value) means replace all occurrences.
11951 /
11952
11953 Return a copy with all occurrences of substring old replaced by new.
11954
11955 If the optional argument count is given, only the first count occurrences are
11956 replaced.
11957 [clinic start generated code]*/
11958
11959 static PyObject *
11960 14796883 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
11961 Py_ssize_t count)
11962 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
11963 {
11964 14796883 return replace(self, old, new, count);
11965 }
11966
11967 /*[clinic input]
11968 str.removeprefix as unicode_removeprefix
11969
11970 prefix: unicode
11971 /
11972
11973 Return a str with the given prefix string removed if present.
11974
11975 If the string starts with the prefix string, return string[len(prefix):].
11976 Otherwise, return a copy of the original string.
11977 [clinic start generated code]*/
11978
11979 static PyObject *
11980 unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
11981 /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
11982 {
11983 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
11984 if (match == -1) {
11985 return NULL;
11986 }
11987 if (match) {
11988 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
11989 PyUnicode_GET_LENGTH(self));
11990 }
11991 return unicode_result_unchanged(self);
11992 }
11993
11994 /*[clinic input]
11995 str.removesuffix as unicode_removesuffix
11996
11997 suffix: unicode
11998 /
11999
12000 Return a str with the given suffix string removed if present.
12001
12002 If the string ends with the suffix string and that suffix is not empty,
12003 return string[:-len(suffix)]. Otherwise, return a copy of the original
12004 string.
12005 [clinic start generated code]*/
12006
12007 static PyObject *
12008 unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12009 /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12010 {
12011 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12012 if (match == -1) {
12013 return NULL;
12014 }
12015 if (match) {
12016 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12017 - PyUnicode_GET_LENGTH(suffix));
12018 }
12019 return unicode_result_unchanged(self);
12020 }
12021
12022 static PyObject *
12023 250844 unicode_repr(PyObject *unicode)
12024 {
12025 PyObject *repr;
12026 Py_ssize_t isize;
12027 Py_ssize_t osize, squote, dquote, i, o;
12028 Py_UCS4 max, quote;
12029 int ikind, okind, unchanged;
12030 const void *idata;
12031 void *odata;
12032
12033 250844 isize = PyUnicode_GET_LENGTH(unicode);
12034 250844 idata = PyUnicode_DATA(unicode);
12035
12036 /* Compute length of output, quote characters, and
12037 maximum character */
12038 250844 osize = 0;
12039 250844 max = 127;
12040 250844 squote = dquote = 0;
12041 250844 ikind = PyUnicode_KIND(unicode);
12042
2/2
✓ Branch 0 taken 11963915 times.
✓ Branch 1 taken 250844 times.
12214759 for (i = 0; i < isize; i++) {
12043 11963915 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12044 11963915 Py_ssize_t incr = 1;
12045
4/4
✓ Branch 0 taken 8072 times.
✓ Branch 1 taken 6574 times.
✓ Branch 2 taken 125858 times.
✓ Branch 3 taken 11823411 times.
11963915 switch (ch) {
12046 8072 case '\'': squote++; break;
12047 6574 case '"': dquote++; break;
12048 125858 case '\\': case '\t': case '\r': case '\n':
12049 125858 incr = 2;
12050 125858 break;
12051 11823411 default:
12052 /* Fast-path ASCII */
12053
4/4
✓ Branch 0 taken 11823324 times.
✓ Branch 1 taken 87 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 11823321 times.
11823411 if (ch < ' ' || ch == 0x7f)
12054 90 incr = 4; /* \xHH */
12055
2/2
✓ Branch 0 taken 431 times.
✓ Branch 1 taken 11822890 times.
11823321 else if (ch < 0x7f)
12056 ;
12057
2/2
✓ Branch 1 taken 406 times.
✓ Branch 2 taken 25 times.
431 else if (Py_UNICODE_ISPRINTABLE(ch))
12058 406 max = ch > max ? ch : max;
12059
1/2
✓ Branch 0 taken 25 times.
✗ Branch 1 not taken.
25 else if (ch < 0x100)
12060 25 incr = 4; /* \xHH */
12061 else if (ch < 0x10000)
12062 incr = 6; /* \uHHHH */
12063 else
12064 incr = 10; /* \uHHHHHHHH */
12065 }
12066
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 11963915 times.
11963915 if (osize > PY_SSIZE_T_MAX - incr) {
12067 PyErr_SetString(PyExc_OverflowError,
12068 "string is too long to generate repr");
12069 return NULL;
12070 }
12071 11963915 osize += incr;
12072 }
12073
12074 250844 quote = '\'';
12075 250844 unchanged = (osize == isize);
12076
2/2
✓ Branch 0 taken 4188 times.
✓ Branch 1 taken 246656 times.
250844 if (squote) {
12077 4188 unchanged = 0;
12078
2/2
✓ Branch 0 taken 617 times.
✓ Branch 1 taken 3571 times.
4188 if (dquote)
12079 /* Both squote and dquote present. Use squote,
12080 and escape them */
12081 617 osize += squote;
12082 else
12083 3571 quote = '"';
12084 }
12085 250844 osize += 2; /* quotes */
12086
12087 250844 repr = PyUnicode_New(osize, max);
12088
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 250844 times.
250844 if (repr == NULL)
12089 return NULL;
12090 250844 okind = PyUnicode_KIND(repr);
12091 250844 odata = PyUnicode_DATA(repr);
12092
12093 250844 PyUnicode_WRITE(okind, odata, 0, quote);
12094 250844 PyUnicode_WRITE(okind, odata, osize-1, quote);
12095
2/2
✓ Branch 0 taken 238494 times.
✓ Branch 1 taken 12350 times.
250844 if (unchanged) {
12096 238494 _PyUnicode_FastCopyCharacters(repr, 1,
12097 unicode, 0,
12098 isize);
12099 }
12100 else {
12101
2/2
✓ Branch 0 taken 3284804 times.
✓ Branch 1 taken 12350 times.
3297154 for (i = 0, o = 1; i < isize; i++) {
12102 3284804 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12103
12104 /* Escape quotes and backslashes */
12105
4/4
✓ Branch 0 taken 3284017 times.
✓ Branch 1 taken 787 times.
✓ Branch 2 taken 57945 times.
✓ Branch 3 taken 3226072 times.
3284804 if ((ch == quote) || (ch == '\\')) {
12106 58732 PyUnicode_WRITE(okind, odata, o++, '\\');
12107 58732 PyUnicode_WRITE(okind, odata, o++, ch);
12108 58732 continue;
12109 }
12110
12111 /* Map special whitespace to '\t', \n', '\r' */
12112
2/2
✓ Branch 0 taken 9 times.
✓ Branch 1 taken 3226063 times.
3226072 if (ch == '\t') {
12113 9 PyUnicode_WRITE(okind, odata, o++, '\\');
12114 9 PyUnicode_WRITE(okind, odata, o++, 't');
12115 }
12116
2/2
✓ Branch 0 taken 67891 times.
✓ Branch 1 taken 3158172 times.
3226063 else if (ch == '\n') {
12117 67891 PyUnicode_WRITE(okind, odata, o++, '\\');
12118 67891 PyUnicode_WRITE(okind, odata, o++, 'n');
12119 }
12120
2/2
✓ Branch 0 taken 13 times.
✓ Branch 1 taken 3158159 times.
3158172 else if (ch == '\r') {
12121 13 PyUnicode_WRITE(okind, odata, o++, '\\');
12122 13 PyUnicode_WRITE(okind, odata, o++, 'r');
12123 }
12124
12125 /* Map non-printable US ASCII to '\xhh' */
12126
4/4
✓ Branch 0 taken 3158072 times.
✓ Branch 1 taken 87 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 3158069 times.
3158159 else if (ch < ' ' || ch == 0x7F) {
12127 90 PyUnicode_WRITE(okind, odata, o++, '\\');
12128 90 PyUnicode_WRITE(okind, odata, o++, 'x');
12129 90 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12130 90 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12131 }
12132
12133 /* Copy ASCII characters as-is */
12134
2/2
✓ Branch 0 taken 3157653 times.
✓ Branch 1 taken 416 times.
3158069 else if (ch < 0x7F) {
12135 3157653 PyUnicode_WRITE(okind, odata, o++, ch);
12136 }
12137
12138 /* Non-ASCII characters */
12139 else {
12140 /* Map Unicode whitespace and control characters
12141 (categories Z* and C* except ASCII space)
12142 */
12143
2/2
✓ Branch 1 taken 25 times.
✓ Branch 2 taken 391 times.
416 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12144 25 PyUnicode_WRITE(okind, odata, o++, '\\');
12145 /* Map 8-bit characters to '\xhh' */
12146
1/2
✓ Branch 0 taken 25 times.
✗ Branch 1 not taken.
25 if (ch <= 0xff) {
12147 25 PyUnicode_WRITE(okind, odata, o++, 'x');
12148 25 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12149 25 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12150 }
12151 /* Map 16-bit characters to '\uxxxx' */
12152 else if (ch <= 0xffff) {
12153 PyUnicode_WRITE(okind, odata, o++, 'u');
12154 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12155 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12156 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12157 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12158 }
12159 /* Map 21-bit characters to '\U00xxxxxx' */
12160 else {
12161 PyUnicode_WRITE(okind, odata, o++, 'U');
12162 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12163 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12164 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12165 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12166 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12167 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12168 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12169 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12170 }
12171 }
12172 /* Copy characters as-is */
12173 else {
12174 391 PyUnicode_WRITE(okind, odata, o++, ch);
12175 }
12176 }
12177 }
12178 }
12179 /* Closing quote already added at the beginning */
12180 assert(_PyUnicode_CheckConsistency(repr, 1));
12181 250844 return repr;
12182 }
12183
12184 PyDoc_STRVAR(rfind__doc__,
12185 "S.rfind(sub[, start[, end]]) -> int\n\
12186 \n\
12187 Return the highest index in S where substring sub is found,\n\
12188 such that sub is contained within S[start:end]. Optional\n\
12189 arguments start and end are interpreted as in slice notation.\n\
12190 \n\
12191 Return -1 on failure.");
12192
12193 static PyObject *
12194 1695927 unicode_rfind(PyObject *self, PyObject *args)
12195 {
12196 /* initialize variables to prevent gcc warning */
12197 1695927 PyObject *substring = NULL;
12198 1695927 Py_ssize_t start = 0;
12199 1695927 Py_ssize_t end = 0;
12200 Py_ssize_t result;
12201
12202
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 1695927 times.
1695927 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12203 return NULL;
12204
12205 1695927 result = any_find_slice(self, substring, start, end, -1);
12206
12207
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1695927 times.
1695927 if (result == -2)
12208 return NULL;
12209
12210 1695927 return PyLong_FromSsize_t(result);
12211 }
12212
12213 PyDoc_STRVAR(rindex__doc__,
12214 "S.rindex(sub[, start[, end]]) -> int\n\
12215 \n\
12216 Return the highest index in S where substring sub is found,\n\
12217 such that sub is contained within S[start:end]. Optional\n\
12218 arguments start and end are interpreted as in slice notation.\n\
12219 \n\
12220 Raises ValueError when the substring is not found.");
12221
12222 static PyObject *
12223 11574 unicode_rindex(PyObject *self, PyObject *args)
12224 {
12225 /* initialize variables to prevent gcc warning */
12226 11574 PyObject *substring = NULL;
12227 11574 Py_ssize_t start = 0;
12228 11574 Py_ssize_t end = 0;
12229 Py_ssize_t result;
12230
12231
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 11574 times.
11574 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12232 return NULL;
12233
12234 11574 result = any_find_slice(self, substring, start, end, -1);
12235
12236
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 11574 times.
11574 if (result == -2)
12237 return NULL;
12238
12239
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 11574 times.
11574 if (result < 0) {
12240 PyErr_SetString(PyExc_ValueError, "substring not found");
12241 return NULL;
12242 }
12243
12244 11574 return PyLong_FromSsize_t(result);
12245 }
12246
12247 /*[clinic input]
12248 str.rjust as unicode_rjust
12249
12250 width: Py_ssize_t
12251 fillchar: Py_UCS4 = ' '
12252 /
12253
12254 Return a right-justified string of length width.
12255
12256 Padding is done using the specified fill character (default is a space).
12257 [clinic start generated code]*/
12258
12259 static PyObject *
12260 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12261 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12262 {
12263 if (PyUnicode_GET_LENGTH(self) >= width)
12264 return unicode_result_unchanged(self);
12265
12266 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12267 }
12268
12269 PyObject *
12270 81648 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12271 {
12272
3/6
✓ Branch 1 taken 81648 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 81648 times.
✗ Branch 4 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 81648 times.
81648 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12273 return NULL;
12274
12275 81648 return split(s, sep, maxsplit);
12276 }
12277
12278 /*[clinic input]
12279 str.split as unicode_split
12280
12281 sep: object = None
12282 The separator used to split the string.
12283
12284 When set to None (the default value), will split on any whitespace
12285 character (including \\n \\r \\t \\f and spaces) and will discard
12286 empty strings from the result.
12287 maxsplit: Py_ssize_t = -1
12288 Maximum number of splits (starting from the left).
12289 -1 (the default value) means no limit.
12290
12291 Return a list of the substrings in the string, using sep as the separator string.
12292
12293 Note, str.split() is mainly useful for data that has been intentionally
12294 delimited. With natural text that includes punctuation, consider using
12295 the regular expression module.
12296
12297 [clinic start generated code]*/
12298
12299 static PyObject *
12300 1239375 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12301 /*[clinic end generated code: output=3a65b1db356948dc input=906d953b44efc43b]*/
12302 {
12303
2/2
✓ Branch 0 taken 93325 times.
✓ Branch 1 taken 1146050 times.
1239375 if (sep == Py_None)
12304 93325 return split(self, NULL, maxsplit);
12305
1/2
✓ Branch 2 taken 1146050 times.
✗ Branch 3 not taken.
1146050 if (PyUnicode_Check(sep))
12306 1146050 return split(self, sep, maxsplit);
12307
12308 PyErr_Format(PyExc_TypeError,
12309 "must be str or None, not %.100s",
12310 Py_TYPE(sep)->tp_name);
12311 return NULL;
12312 }
12313
12314 PyObject *
12315 157718 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12316 {
12317 PyObject* out;
12318 int kind1, kind2;
12319 const void *buf1, *buf2;
12320 Py_ssize_t len1, len2;
12321
12322
2/4
✓ Branch 1 taken 157718 times.
✗ Branch 2 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 157718 times.
157718 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12323 return NULL;
12324
12325 157718 kind1 = PyUnicode_KIND(str_obj);
12326 157718 kind2 = PyUnicode_KIND(sep_obj);
12327 157718 len1 = PyUnicode_GET_LENGTH(str_obj);
12328 157718 len2 = PyUnicode_GET_LENGTH(sep_obj);
12329
3/4
✓ Branch 0 taken 157718 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1656 times.
✓ Branch 3 taken 156062 times.
157718 if (kind1 < kind2 || len1 < len2) {
12330 1656 PyObject *empty = unicode_get_empty(); // Borrowed reference
12331 1656 return PyTuple_Pack(3, str_obj, empty, empty);
12332 }
12333 156062 buf1 = PyUnicode_DATA(str_obj);
12334 156062 buf2 = PyUnicode_DATA(sep_obj);
12335
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 156062 times.
156062 if (kind2 != kind1) {
12336 buf2 = unicode_askind(kind2, buf2, len2, kind1);
12337 if (!buf2)
12338 return NULL;
12339 }
12340
12341
1/4
✓ Branch 0 taken 156062 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
156062 switch (kind1) {
12342 156062 case PyUnicode_1BYTE_KIND:
12343
2/4
✓ Branch 1 taken 156062 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 156062 times.
✗ Branch 5 not taken.
156062 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12344 156062 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12345 else
12346 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12347 156062 break;
12348 case PyUnicode_2BYTE_KIND:
12349 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12350 break;
12351 case PyUnicode_4BYTE_KIND:
12352 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12353 break;
12354 default:
12355 Py_UNREACHABLE();
12356 }
12357
12358 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
12359
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 156062 times.
156062 if (kind2 != kind1)
12360 PyMem_Free((void *)buf2);
12361
12362 156062 return out;
12363 }
12364
12365
12366 PyObject *
12367 4156878 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12368 {
12369 PyObject* out;
12370 int kind1, kind2;
12371 const void *buf1, *buf2;
12372 Py_ssize_t len1, len2;
12373
12374
2/4
✓ Branch 1 taken 4156878 times.
✗ Branch 2 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 4156878 times.
4156878 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12375 return NULL;
12376
12377 4156878 kind1 = PyUnicode_KIND(str_obj);
12378 4156878 kind2 = PyUnicode_KIND(sep_obj);
12379 4156878 len1 = PyUnicode_GET_LENGTH(str_obj);
12380 4156878 len2 = PyUnicode_GET_LENGTH(sep_obj);
12381
3/4
✓ Branch 0 taken 4156878 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 687 times.
✓ Branch 3 taken 4156191 times.
4156878 if (kind1 < kind2 || len1 < len2) {
12382 687 PyObject *empty = unicode_get_empty(); // Borrowed reference
12383 687 return PyTuple_Pack(3, empty, empty, str_obj);
12384 }
12385 4156191 buf1 = PyUnicode_DATA(str_obj);
12386 4156191 buf2 = PyUnicode_DATA(sep_obj);
12387
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 4156191 times.
4156191 if (kind2 != kind1) {
12388 buf2 = unicode_askind(kind2, buf2, len2, kind1);
12389 if (!buf2)
12390 return NULL;
12391 }
12392
12393
1/4
✓ Branch 0 taken 4156191 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
4156191 switch (kind1) {
12394 4156191 case PyUnicode_1BYTE_KIND:
12395
2/4
✓ Branch 1 taken 4156191 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 4156191 times.
✗ Branch 5 not taken.
4156191 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12396 4156191 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12397 else
12398 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12399 4156191 break;
12400 case PyUnicode_2BYTE_KIND:
12401 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12402 break;
12403 case PyUnicode_4BYTE_KIND:
12404 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12405 break;
12406 default:
12407 Py_UNREACHABLE();
12408 }
12409
12410 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
12411
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 4156191 times.
4156191 if (kind2 != kind1)
12412 PyMem_Free((void *)buf2);
12413
12414 4156191 return out;
12415 }
12416
12417 /*[clinic input]
12418 str.partition as unicode_partition
12419
12420 sep: object
12421 /
12422
12423 Partition the string into three parts using the given separator.
12424
12425 This will search for the separator in the string. If the separator is found,
12426 returns a 3-tuple containing the part before the separator, the separator
12427 itself, and the part after it.
12428
12429 If the separator is not found, returns a 3-tuple containing the original string
12430 and two empty strings.
12431 [clinic start generated code]*/
12432
12433 static PyObject *
12434 157718 unicode_partition(PyObject *self, PyObject *sep)
12435 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
12436 {
12437 157718 return PyUnicode_Partition(self, sep);
12438 }
12439
12440 /*[clinic input]
12441 str.rpartition as unicode_rpartition = str.partition
12442
12443 Partition the string into three parts using the given separator.
12444
12445 This will search for the separator in the string, starting at the end. If
12446 the separator is found, returns a 3-tuple containing the part before the
12447 separator, the separator itself, and the part after it.
12448
12449 If the separator is not found, returns a 3-tuple containing two empty strings
12450 and the original string.
12451 [clinic start generated code]*/
12452
12453 static PyObject *
12454 4156878 unicode_rpartition(PyObject *self, PyObject *sep)
12455 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
12456 {
12457 4156878 return PyUnicode_RPartition(self, sep);
12458 }
12459
12460 PyObject *
12461 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12462 {
12463 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12464 return NULL;
12465
12466 return rsplit(s, sep, maxsplit);
12467 }
12468
12469 /*[clinic input]
12470 str.rsplit as unicode_rsplit = str.split
12471
12472 Return a list of the substrings in the string, using sep as the separator string.
12473
12474 Splitting starts at the end of the string and works to the front.
12475 [clinic start generated code]*/
12476
12477 static PyObject *
12478 9567 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12479 /*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
12480 {
12481
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 9567 times.
9567 if (sep == Py_None)
12482 return rsplit(self, NULL, maxsplit);
12483
1/2
✓ Branch 2 taken 9567 times.
✗ Branch 3 not taken.
9567 if (PyUnicode_Check(sep))
12484 9567 return rsplit(self, sep, maxsplit);
12485
12486 PyErr_Format(PyExc_TypeError,
12487 "must be str or None, not %.100s",
12488 Py_TYPE(sep)->tp_name);
12489 return NULL;
12490 }
12491
12492 /*[clinic input]
12493 str.splitlines as unicode_splitlines
12494
12495 keepends: bool(accept={int}) = False
12496
12497 Return a list of the lines in the string, breaking at line boundaries.
12498
12499 Line breaks are not included in the resulting list unless keepends is given and
12500 true.
12501 [clinic start generated code]*/
12502
12503 static PyObject *
12504 24472 unicode_splitlines_impl(PyObject *self, int keepends)
12505 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
12506 {
12507 24472 return PyUnicode_Splitlines(self, keepends);
12508 }
12509
12510 static
12511 1120 PyObject *unicode_str(PyObject *self)
12512 {
12513 1120 return unicode_result_unchanged(self);
12514 }
12515
12516 /*[clinic input]
12517 str.swapcase as unicode_swapcase
12518
12519 Convert uppercase characters to lowercase and lowercase characters to uppercase.
12520 [clinic start generated code]*/
12521
12522 static PyObject *
12523 unicode_swapcase_impl(PyObject *self)
12524 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
12525 {
12526 return case_operation(self, do_swapcase);
12527 }
12528
12529 /*[clinic input]
12530
12531 @staticmethod
12532 str.maketrans as unicode_maketrans
12533
12534 x: object
12535
12536 y: unicode=NULL
12537
12538 z: unicode=NULL
12539
12540 /
12541
12542 Return a translation table usable for str.translate().
12543
12544 If there is only one argument, it must be a dictionary mapping Unicode
12545 ordinals (integers) or characters to Unicode ordinals, strings or None.
12546 Character keys will be then converted to ordinals.
12547 If there are two arguments, they must be strings of equal length, and
12548 in the resulting dictionary, each character in x will be mapped to the
12549 character at the same position in y. If there is a third argument, it
12550 must be a string, whose characters will be mapped to None in the result.
12551 [clinic start generated code]*/
12552
12553 static PyObject *
12554 250 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
12555 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
12556 {
12557 250 PyObject *new = NULL, *key, *value;
12558 250 Py_ssize_t i = 0;
12559 int res;
12560
12561 250 new = PyDict_New();
12562
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 250 times.
250 if (!new)
12563 return NULL;
12564
1/2
✓ Branch 0 taken 250 times.
✗ Branch 1 not taken.
250 if (y != NULL) {
12565 int x_kind, y_kind, z_kind;
12566 const void *x_data, *y_data, *z_data;
12567
12568 /* x must be a string too, of equal length */
12569
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 250 times.
250 if (!PyUnicode_Check(x)) {
12570 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12571 "be a string if there is a second argument");
12572 goto err;
12573 }
12574
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 250 times.
250 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12575 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12576 "arguments must have equal length");
12577 goto err;
12578 }
12579 /* create entries for translating chars in x to those in y */
12580 250 x_kind = PyUnicode_KIND(x);
12581 250 y_kind = PyUnicode_KIND(y);
12582 250 x_data = PyUnicode_DATA(x);
12583 250 y_data = PyUnicode_DATA(y);
12584
2/2
✓ Branch 1 taken 250 times.
✓ Branch 2 taken 250 times.
500 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12585 250 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12586
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 250 times.
250 if (!key)
12587 goto err;
12588 250 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12589
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 250 times.
250 if (!value) {
12590 Py_DECREF(key);
12591 goto err;
12592 }
12593 250 res = PyDict_SetItem(new, key, value);
12594 250 Py_DECREF(key);
12595 250 Py_DECREF(value);
12596
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 250 times.
250 if (res < 0)
12597 goto err;
12598 }
12599 /* create entries for deleting chars in z */
12600
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 250 times.
250 if (z != NULL) {
12601 z_kind = PyUnicode_KIND(z);
12602 z_data = PyUnicode_DATA(z);
12603 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
12604 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12605 if (!key)
12606 goto err;
12607 res = PyDict_SetItem(new, key, Py_None);
12608 Py_DECREF(key);
12609 if (res < 0)
12610 goto err;
12611 }
12612 }
12613 } else {
12614 int kind;
12615 const void *data;
12616
12617 /* x must be a dict */
12618 if (!PyDict_CheckExact(x)) {
12619 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12620 "to maketrans it must be a dict");
12621 goto err;
12622 }
12623 /* copy entries into the new dict, converting string keys to int keys */
12624 while (PyDict_Next(x, &i, &key, &value)) {
12625 if (PyUnicode_Check(key)) {
12626 /* convert string keys to integer keys */
12627 PyObject *newkey;
12628 if (PyUnicode_GET_LENGTH(key) != 1) {
12629 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12630 "table must be of length 1");
12631 goto err;
12632 }
12633 kind = PyUnicode_KIND(key);
12634 data = PyUnicode_DATA(key);
12635 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
12636 if (!newkey)
12637 goto err;
12638 res = PyDict_SetItem(new, newkey, value);
12639 Py_DECREF(newkey);
12640 if (res < 0)
12641 goto err;
12642 } else if (PyLong_Check(key)) {
12643 /* just keep integer keys */
12644 if (PyDict_SetItem(new, key, value) < 0)
12645 goto err;
12646 } else {
12647 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12648 "be strings or integers");
12649 goto err;
12650 }
12651 }
12652 }
12653 250 return new;
12654 err:
12655 Py_DECREF(new);
12656 return NULL;
12657 }
12658
12659 /*[clinic input]
12660 str.translate as unicode_translate
12661
12662 table: object
12663 Translation table, which must be a mapping of Unicode ordinals to
12664 Unicode ordinals, strings, or None.
12665 /
12666
12667 Replace each character in the string using the given translation table.
12668
12669 The table must implement lookup/indexing via __getitem__, for instance a
12670 dictionary or list. If this operation raises LookupError, the character is
12671 left untouched. Characters mapped to None are deleted.
12672 [clinic start generated code]*/
12673
12674 static PyObject *
12675 720739 unicode_translate(PyObject *self, PyObject *table)
12676 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
12677 {
12678 720739 return _PyUnicode_TranslateCharmap(self, table, "ignore");
12679 }
12680
12681 /*[clinic input]
12682 str.upper as unicode_upper
12683
12684 Return a copy of the string converted to uppercase.
12685 [clinic start generated code]*/
12686
12687 static PyObject *
12688 129369 unicode_upper_impl(PyObject *self)
12689 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
12690 {
12691
2/2
✓ Branch 1 taken 128903 times.
✓ Branch 2 taken 466 times.
129369 if (PyUnicode_IS_ASCII(self))
12692 128903 return ascii_upper_or_lower(self, 0);
12693 466 return case_operation(self, do_upper);
12694 }
12695
12696 /*[clinic input]
12697 str.zfill as unicode_zfill
12698
12699 width: Py_ssize_t
12700 /
12701
12702 Pad a numeric string with zeros on the left, to fill a field of the given width.
12703
12704 The string is never truncated.
12705 [clinic start generated code]*/
12706
12707 static PyObject *
12708 1420 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
12709 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
12710 {
12711 Py_ssize_t fill;
12712 PyObject *u;
12713 int kind;
12714 const void *data;
12715 Py_UCS4 chr;
12716
12717
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 1420 times.
1420 if (PyUnicode_GET_LENGTH(self) >= width)
12718 return unicode_result_unchanged(self);
12719
12720 1420 fill = width - PyUnicode_GET_LENGTH(self);
12721
12722 1420 u = pad(self, fill, 0, '0');
12723
12724
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1420 times.
1420 if (u == NULL)
12725 return NULL;
12726
12727 1420 kind = PyUnicode_KIND(u);
12728 1420 data = PyUnicode_DATA(u);
12729 1420 chr = PyUnicode_READ(kind, data, fill);
12730
12731
2/4
✓ Branch 0 taken 1420 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1420 times.
1420 if (chr == '+' || chr == '-') {
12732 /* move sign to beginning of string */
12733 PyUnicode_WRITE(kind, data, 0, chr);
12734 PyUnicode_WRITE(kind, data, fill, '0');
12735 }
12736
12737 assert(_PyUnicode_CheckConsistency(u, 1));
12738 1420 return u;
12739 }
12740
12741 PyDoc_STRVAR(startswith__doc__,
12742 "S.startswith(prefix[, start[, end]]) -> bool\n\
12743 \n\
12744 Return True if S starts with the specified prefix, False otherwise.\n\
12745 With optional start, test S beginning at that position.\n\
12746 With optional end, stop comparing S at that position.\n\
12747 prefix can also be a tuple of strings to try.");
12748
12749 static PyObject *
12750 11461659 unicode_startswith(PyObject *self,
12751 PyObject *args)
12752 {
12753 PyObject *subobj;
12754 PyObject *substring;
12755 11461659 Py_ssize_t start = 0;
12756 11461659 Py_ssize_t end = PY_SSIZE_T_MAX;
12757 int result;
12758
12759
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 11461659 times.
11461659 if (!asciilib_parse_args_finds("startswith", args, &subobj, &start, &end))
12760 return NULL;
12761
2/2
✓ Branch 2 taken 89175 times.
✓ Branch 3 taken 11372484 times.
11461659 if (PyTuple_Check(subobj)) {
12762 Py_ssize_t i;
12763
2/2
✓ Branch 1 taken 227944 times.
✓ Branch 2 taken 82629 times.
310573 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12764 227944 substring = PyTuple_GET_ITEM(subobj, i);
12765
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 227944 times.
227944 if (!PyUnicode_Check(substring)) {
12766 PyErr_Format(PyExc_TypeError,
12767 "tuple for startswith must only contain str, "
12768 "not %.100s",
12769 Py_TYPE(substring)->tp_name);
12770 return NULL;
12771 }
12772 227944 result = tailmatch(self, substring, start, end, -1);
12773
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 227944 times.
227944 if (result == -1)
12774 return NULL;
12775
2/2
✓ Branch 0 taken 6546 times.
✓ Branch 1 taken 221398 times.
227944 if (result) {
12776 6546 Py_RETURN_TRUE;
12777 }
12778 }
12779 /* nothing matched */
12780 82629 Py_RETURN_FALSE;
12781 }
12782
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 11372484 times.
11372484 if (!PyUnicode_Check(subobj)) {
12783 PyErr_Format(PyExc_TypeError,
12784 "startswith first arg must be str or "
12785 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
12786 return NULL;
12787 }
12788 11372484 result = tailmatch(self, subobj, start, end, -1);
12789
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 11372484 times.
11372484 if (result == -1)
12790 return NULL;
12791 11372484 return PyBool_FromLong(result);
12792 }
12793
12794
12795 PyDoc_STRVAR(endswith__doc__,
12796 "S.endswith(suffix[, start[, end]]) -> bool\n\
12797 \n\
12798 Return True if S ends with the specified suffix, False otherwise.\n\
12799 With optional start, test S beginning at that position.\n\
12800 With optional end, stop comparing S at that position.\n\
12801 suffix can also be a tuple of strings to try.");
12802
12803 static PyObject *
12804 4439804 unicode_endswith(PyObject *self,
12805 PyObject *args)
12806 {
12807 PyObject *subobj;
12808 PyObject *substring;
12809 4439804 Py_ssize_t start = 0;
12810 4439804 Py_ssize_t end = PY_SSIZE_T_MAX;
12811 int result;
12812
12813
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 4439804 times.
4439804 if (!asciilib_parse_args_finds("endswith", args, &subobj, &start, &end))
12814 return NULL;
12815
2/2
✓ Branch 2 taken 462848 times.
✓ Branch 3 taken 3976956 times.
4439804 if (PyTuple_Check(subobj)) {
12816 Py_ssize_t i;
12817
2/2
✓ Branch 1 taken 480415 times.
✓ Branch 2 taken 113097 times.
593512 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12818 480415 substring = PyTuple_GET_ITEM(subobj, i);
12819
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 480415 times.
480415 if (!PyUnicode_Check(substring)) {
12820 PyErr_Format(PyExc_TypeError,
12821 "tuple for endswith must only contain str, "
12822 "not %.100s",
12823 Py_TYPE(substring)->tp_name);
12824 return NULL;
12825 }
12826 480415 result = tailmatch(self, substring, start, end, +1);
12827
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 480415 times.
480415 if (result == -1)
12828 return NULL;
12829
2/2
✓ Branch 0 taken 349751 times.
✓ Branch 1 taken 130664 times.
480415 if (result) {
12830 349751 Py_RETURN_TRUE;
12831 }
12832 }
12833 113097 Py_RETURN_FALSE;
12834 }
12835
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 3976956 times.
3976956 if (!PyUnicode_Check(subobj)) {
12836 PyErr_Format(PyExc_TypeError,
12837 "endswith first arg must be str or "
12838 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
12839 return NULL;
12840 }
12841 3976956 result = tailmatch(self, subobj, start, end, +1);
12842
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3976956 times.
3976956 if (result == -1)
12843 return NULL;
12844 3976956 return PyBool_FromLong(result);
12845 }
12846
12847 static inline void
12848 78641672 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
12849 {
12850 78641672 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12851 78641672 writer->data = PyUnicode_DATA(writer->buffer);
12852
12853
2/2
✓ Branch 0 taken 78388845 times.
✓ Branch 1 taken 252827 times.
78641672 if (!writer->readonly) {
12854 78388845 writer->kind = PyUnicode_KIND(writer->buffer);
12855 78388845 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
12856 }
12857 else {
12858 /* use a value smaller than PyUnicode_1BYTE_KIND() so
12859 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
12860 252827 writer->kind = 0;
12861 assert(writer->kind <= PyUnicode_1BYTE_KIND);
12862
12863 /* Copy-on-write mode: set buffer size to 0 so
12864 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
12865 * next write. */
12866 252827 writer->size = 0;
12867 }
12868 78641672 }
12869
12870 void
12871 78369064 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
12872 {
12873 78369064 memset(writer, 0, sizeof(*writer));
12874
12875 /* ASCII is the bare minimum */
12876 78369064 writer->min_char = 127;
12877
12878 /* use a value smaller than PyUnicode_1BYTE_KIND() so
12879 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
12880 78369064 writer->kind = 0;
12881 assert(writer->kind <= PyUnicode_1BYTE_KIND);
12882 78369064 }
12883
12884 // Initialize _PyUnicodeWriter with initial buffer
12885 static inline void
12886 1768563 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
12887 {
12888 1768563 memset(writer, 0, sizeof(*writer));
12889 1768563 writer->buffer = buffer;
12890 1768563 _PyUnicodeWriter_Update(writer);
12891 1768563 writer->min_length = writer->size;
12892 1768563 }
12893
12894 int
12895 76620282 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12896 Py_ssize_t length, Py_UCS4 maxchar)
12897 {
12898 Py_ssize_t newlen;
12899 PyObject *newbuffer;
12900
12901 assert(maxchar <= MAX_UNICODE);
12902
12903 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
12904 assert((maxchar > writer->maxchar && length >= 0)
12905 || length > 0);
12906
12907
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 76620282 times.
76620282 if (length > PY_SSIZE_T_MAX - writer->pos) {
12908 PyErr_NoMemory();
12909 return -1;
12910 }
12911 76620282 newlen = writer->pos + length;
12912
12913 76620282 maxchar = Py_MAX(maxchar, writer->min_char);
12914
12915
2/2
✓ Branch 0 taken 74692081 times.
✓ Branch 1 taken 1928201 times.
76620282 if (writer->buffer == NULL) {
12916 assert(!writer->readonly);
12917
2/2
✓ Branch 0 taken 73486644 times.
✓ Branch 1 taken 1205437 times.
74692081 if (writer->overallocate
12918
1/2
✓ Branch 0 taken 73486644 times.
✗ Branch 1 not taken.
73486644 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
12919 /* overallocate to limit the number of realloc() */
12920 73486644 newlen += newlen / OVERALLOCATE_FACTOR;
12921 }
12922
2/2
✓ Branch 0 taken 73560448 times.
✓ Branch 1 taken 1131633 times.
74692081 if (newlen < writer->min_length)
12923 73560448 newlen = writer->min_length;
12924
12925 74692081 writer->buffer = PyUnicode_New(newlen, maxchar);
12926
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 74692081 times.
74692081 if (writer->buffer == NULL)
12927 return -1;
12928 }
12929
2/2
✓ Branch 0 taken 126278 times.
✓ Branch 1 taken 1801923 times.
1928201 else if (newlen > writer->size) {
12930
2/2
✓ Branch 0 taken 56066 times.
✓ Branch 1 taken 70212 times.
126278 if (writer->overallocate
12931
1/2
✓ Branch 0 taken 56066 times.
✗ Branch 1 not taken.
56066 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
12932 /* overallocate to limit the number of realloc() */
12933 56066 newlen += newlen / OVERALLOCATE_FACTOR;
12934 }
12935
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 126278 times.
126278 if (newlen < writer->min_length)
12936 newlen = writer->min_length;
12937
12938
3/4
✓ Branch 0 taken 126233 times.
✓ Branch 1 taken 45 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 126233 times.
126278 if (maxchar > writer->maxchar || writer->readonly) {
12939 /* resize + widen */
12940 45 maxchar = Py_MAX(maxchar, writer->maxchar);
12941 45 newbuffer = PyUnicode_New(newlen, maxchar);
12942
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 45 times.
45 if (newbuffer == NULL)
12943 return -1;
12944 45 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12945 writer->buffer, 0, writer->pos);
12946 45 Py_DECREF(writer->buffer);
12947 45 writer->readonly = 0;
12948 }
12949 else {
12950 126233 newbuffer = resize_compact(writer->buffer, newlen);
12951
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 126233 times.
126233 if (newbuffer == NULL)
12952 return -1;
12953 }
12954 126278 writer->buffer = newbuffer;
12955 }
12956
1/2
✓ Branch 0 taken 1801923 times.
✗ Branch 1 not taken.
1801923 else if (maxchar > writer->maxchar) {
12957 assert(!writer->readonly);
12958 1801923 newbuffer = PyUnicode_New(writer->size, maxchar);
12959
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1801923 times.
1801923 if (newbuffer == NULL)
12960 return -1;
12961 1801923 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12962 writer->buffer, 0, writer->pos);
12963 1801923 Py_SETREF(writer->buffer, newbuffer);
12964 }
12965 76620282 _PyUnicodeWriter_Update(writer);
12966 76620282 return 0;
12967
12968 #undef OVERALLOCATE_FACTOR
12969 }
12970
12971 int
12972 1 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
12973 int kind)
12974 {
12975 Py_UCS4 maxchar;
12976
12977 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
12978 assert(writer->kind < kind);
12979
12980
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
1 switch (kind)
12981 {
12982 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
12983 1 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
12984 case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
12985 default:
12986 Py_UNREACHABLE();
12987 }
12988
12989 1 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
12990 }
12991
12992 static inline int
12993 1847390 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
12994 {
12995 assert(ch <= MAX_UNICODE);
12996
5/6
✓ Branch 0 taken 33184 times.
✓ Branch 1 taken 1814206 times.
✓ Branch 2 taken 17031 times.
✓ Branch 3 taken 16153 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 1831237 times.
1847390 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
12997 return -1;
12998 1847390 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
12999 1847390 writer->pos++;
13000 1847390 return 0;
13001 }
13002
13003 int
13004 27536 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13005 {
13006 27536 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13007 }
13008
13009 int
13010 91342396 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13011 {
13012 Py_UCS4 maxchar;
13013 Py_ssize_t len;
13014
13015 91342396 len = PyUnicode_GET_LENGTH(str);
13016
2/2
✓ Branch 0 taken 21450 times.
✓ Branch 1 taken 91320946 times.
91342396 if (len == 0)
13017 21450 return 0;
13018 91320946 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13019
4/4
✓ Branch 0 taken 89148719 times.
✓ Branch 1 taken 2172227 times.
✓ Branch 2 taken 100569 times.
✓ Branch 3 taken 89048150 times.
91320946 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13020
4/4
✓ Branch 0 taken 2168224 times.
✓ Branch 1 taken 104572 times.
✓ Branch 2 taken 252827 times.
✓ Branch 3 taken 1915397 times.
2272796 if (writer->buffer == NULL && !writer->overallocate) {
13021 assert(_PyUnicode_CheckConsistency(str, 1));
13022 252827 writer->readonly = 1;
13023 252827 Py_INCREF(str);
13024 252827 writer->buffer = str;
13025 252827 _PyUnicodeWriter_Update(writer);
13026 252827 writer->pos += len;
13027 252827 return 0;
13028 }
13029
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 2019969 times.
2019969 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13030 return -1;
13031 }
13032 91068119 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13033 str, 0, len);
13034 91068119 writer->pos += len;
13035 91068119 return 0;
13036 }
13037
13038 int
13039 18108728 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13040 Py_ssize_t start, Py_ssize_t end)
13041 {
13042 Py_UCS4 maxchar;
13043 Py_ssize_t len;
13044
13045 assert(0 <= start);
13046 assert(end <= PyUnicode_GET_LENGTH(str));
13047 assert(start <= end);
13048
13049
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 18108728 times.
18108728 if (end == 0)
13050 return 0;
13051
13052
4/4
✓ Branch 0 taken 10995528 times.
✓ Branch 1 taken 7113200 times.
✓ Branch 3 taken 8338 times.
✓ Branch 4 taken 10987190 times.
18108728 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13053 8338 return _PyUnicodeWriter_WriteStr(writer, str);
13054
13055
2/2
✓ Branch 1 taken 10987785 times.
✓ Branch 2 taken 7112605 times.
18100390 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13056 10987785 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13057 else
13058 7112605 maxchar = writer->maxchar;
13059 18100390 len = end - start;
13060
13061
6/8
✓ Branch 0 taken 7112605 times.
✓ Branch 1 taken 10987785 times.
✓ Branch 2 taken 2858 times.
✓ Branch 3 taken 7109747 times.
✓ Branch 4 taken 10990643 times.
✗ Branch 5 not taken.
✗ Branch 7 not taken.
✓ Branch 8 taken 10990643 times.
18100390 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13062 return -1;
13063
13064 18100390 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13065 str, start, len);
13066 18100390 writer->pos += len;
13067 18100390 return 0;
13068 }
13069
13070 int
13071 163318995 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13072 const char *ascii, Py_ssize_t len)
13073 {
13074
2/2
✓ Branch 0 taken 551 times.
✓ Branch 1 taken 163318444 times.
163318995 if (len == -1)
13075 551 len = strlen(ascii);
13076
13077 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13078
13079
3/4
✓ Branch 0 taken 60467799 times.
✓ Branch 1 taken 102851196 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 60467799 times.
163318995 if (writer->buffer == NULL && !writer->overallocate) {
13080 PyObject *str;
13081
13082 str = _PyUnicode_FromASCII(ascii, len);
13083 if (str == NULL)
13084 return -1;
13085
13086 writer->readonly = 1;
13087 writer->buffer = str;
13088 _PyUnicodeWriter_Update(writer);
13089 writer->pos += len;
13090 return 0;
13091 }
13092
13093
6/8
✓ Branch 0 taken 102851196 times.
✓ Branch 1 taken 60467799 times.
✓ Branch 2 taken 5776 times.
✓ Branch 3 taken 102845420 times.
✓ Branch 4 taken 60473575 times.
✗ Branch 5 not taken.
✗ Branch 7 not taken.
✓ Branch 8 taken 60473575 times.
163318995 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13094 return -1;
13095
13096
3/4
✓ Branch 0 taken 163318619 times.
✓ Branch 1 taken 292 times.
✓ Branch 2 taken 84 times.
✗ Branch 3 not taken.
163318995 switch (writer->kind)
13097 {
13098 163318619 case PyUnicode_1BYTE_KIND:
13099 {
13100 163318619 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13101 163318619 Py_UCS1 *data = writer->data;
13102
13103 163318619 memcpy(data + writer->pos, str, len);
13104 163318619 break;
13105 }
13106 292 case PyUnicode_2BYTE_KIND:
13107 {
13108
3/4
✗ Branch 0 not taken.
✓ Branch 1 taken 292 times.
✓ Branch 2 taken 584 times.
✓ Branch 3 taken 292 times.
876 _PyUnicode_CONVERT_BYTES(
13109 Py_UCS1, Py_UCS2,
13110 ascii, ascii + len,
13111 (Py_UCS2 *)writer->data + writer->pos);
13112 292 break;
13113 }
13114 84 case PyUnicode_4BYTE_KIND:
13115 {
13116
3/4
✗ Branch 0 not taken.
✓ Branch 1 taken 84 times.
✓ Branch 2 taken 168 times.
✓ Branch 3 taken 84 times.
252 _PyUnicode_CONVERT_BYTES(
13117 Py_UCS1, Py_UCS4,
13118 ascii, ascii + len,
13119 (Py_UCS4 *)writer->data + writer->pos);
13120 84 break;
13121 }
13122 default:
13123 Py_UNREACHABLE();
13124 }
13125
13126 163318995 writer->pos += len;
13127 163318995 return 0;
13128 }
13129
13130 int
13131 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13132 const char *str, Py_ssize_t len)
13133 {
13134 Py_UCS4 maxchar;
13135
13136 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
13137 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13138 return -1;
13139 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13140 writer->pos += len;
13141 return 0;
13142 }
13143
13144 PyObject *
13145 76740477 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13146 {
13147 PyObject *str;
13148
13149
2/2
✓ Branch 0 taken 27048 times.
✓ Branch 1 taken 76713429 times.
76740477 if (writer->pos == 0) {
13150
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 27048 times.
27048 Py_CLEAR(writer->buffer);
13151 27048 _Py_RETURN_UNICODE_EMPTY();
13152 }
13153
13154 76713429 str = writer->buffer;
13155 76713429 writer->buffer = NULL;
13156
13157
2/2
✓ Branch 0 taken 252827 times.
✓ Branch 1 taken 76460602 times.
76713429 if (writer->readonly) {
13158 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13159 252827 return str;
13160 }
13161
13162
2/2
✓ Branch 1 taken 75357188 times.
✓ Branch 2 taken 1103414 times.
76460602 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13163 PyObject *str2;
13164 75357188 str2 = resize_compact(str, writer->pos);
13165
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 75357188 times.
75357188 if (str2 == NULL) {
13166 Py_DECREF(str);
13167 return NULL;
13168 }
13169 75357188 str = str2;
13170 }
13171
13172 assert(_PyUnicode_CheckConsistency(str, 1));
13173 76460602 return unicode_result(str);
13174 }
13175
13176 void
13177 42 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13178 {
13179
1/2
✓ Branch 0 taken 42 times.
✗ Branch 1 not taken.
42 Py_CLEAR(writer->buffer);
13180 42 }
13181
13182 #include "stringlib/unicode_format.h"
13183
13184 PyDoc_STRVAR(format__doc__,
13185 "S.format(*args, **kwargs) -> str\n\
13186 \n\
13187 Return a formatted version of S, using substitutions from args and kwargs.\n\
13188 The substitutions are identified by braces ('{' and '}').");
13189
13190 PyDoc_STRVAR(format_map__doc__,
13191 "S.format_map(mapping) -> str\n\
13192 \n\
13193 Return a formatted version of S, using substitutions from mapping.\n\
13194 The substitutions are identified by braces ('{' and '}').");
13195
13196 /*[clinic input]
13197 str.__format__ as unicode___format__
13198
13199 format_spec: unicode
13200 /
13201
13202 Return a formatted version of the string as described by format_spec.
13203 [clinic start generated code]*/
13204
13205 static PyObject *
13206 3728 unicode___format___impl(PyObject *self, PyObject *format_spec)
13207 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
13208 {
13209 _PyUnicodeWriter writer;
13210 int ret;
13211
13212 3728 _PyUnicodeWriter_Init(&writer);
13213 3728 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13214 self, format_spec, 0,
13215 PyUnicode_GET_LENGTH(format_spec));
13216
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3728 times.
3728 if (ret == -1) {
13217 _PyUnicodeWriter_Dealloc(&writer);
13218 return NULL;
13219 }
13220 3728 return _PyUnicodeWriter_Finish(&writer);
13221 }
13222
13223 /*[clinic input]
13224 str.__sizeof__ as unicode_sizeof
13225
13226 Return the size of the string in memory, in bytes.
13227 [clinic start generated code]*/
13228
13229 static PyObject *
13230 unicode_sizeof_impl(PyObject *self)
13231 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
13232 {
13233 Py_ssize_t size;
13234
13235 /* If it's a compact object, account for base structure +
13236 character data. */
13237 if (PyUnicode_IS_COMPACT_ASCII(self)) {
13238 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13239 }
13240 else if (PyUnicode_IS_COMPACT(self)) {
13241 size = sizeof(PyCompactUnicodeObject) +
13242 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
13243 }
13244 else {
13245 /* If it is a two-block object, account for base object, and
13246 for character block if present. */
13247 size = sizeof(PyUnicodeObject);
13248 if (_PyUnicode_DATA_ANY(self))
13249 size += (PyUnicode_GET_LENGTH(self) + 1) *
13250 PyUnicode_KIND(self);
13251 }
13252 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13253 size += PyUnicode_UTF8_LENGTH(self) + 1;
13254
13255 return PyLong_FromSsize_t(size);
13256 }
13257
13258 static PyObject *
13259 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
13260 {
13261 PyObject *copy = _PyUnicode_Copy(v);
13262 if (!copy)
13263 return NULL;
13264 return Py_BuildValue("(N)", copy);
13265 }
13266
13267 static PyMethodDef unicode_methods[] = {
13268 UNICODE_ENCODE_METHODDEF
13269 UNICODE_REPLACE_METHODDEF
13270 UNICODE_SPLIT_METHODDEF
13271 UNICODE_RSPLIT_METHODDEF
13272 UNICODE_JOIN_METHODDEF
13273 UNICODE_CAPITALIZE_METHODDEF
13274 UNICODE_CASEFOLD_METHODDEF
13275 UNICODE_TITLE_METHODDEF
13276 UNICODE_CENTER_METHODDEF
13277 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13278 UNICODE_EXPANDTABS_METHODDEF
13279 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13280 UNICODE_PARTITION_METHODDEF
13281 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13282 UNICODE_LJUST_METHODDEF
13283 UNICODE_LOWER_METHODDEF
13284 UNICODE_LSTRIP_METHODDEF
13285 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13286 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13287 UNICODE_RJUST_METHODDEF
13288 UNICODE_RSTRIP_METHODDEF
13289 UNICODE_RPARTITION_METHODDEF
13290 UNICODE_SPLITLINES_METHODDEF
13291 UNICODE_STRIP_METHODDEF
13292 UNICODE_SWAPCASE_METHODDEF
13293 UNICODE_TRANSLATE_METHODDEF
13294 UNICODE_UPPER_METHODDEF
13295 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13296 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13297 UNICODE_REMOVEPREFIX_METHODDEF
13298 UNICODE_REMOVESUFFIX_METHODDEF
13299 UNICODE_ISASCII_METHODDEF
13300 UNICODE_ISLOWER_METHODDEF
13301 UNICODE_ISUPPER_METHODDEF
13302 UNICODE_ISTITLE_METHODDEF
13303 UNICODE_ISSPACE_METHODDEF
13304 UNICODE_ISDECIMAL_METHODDEF
13305 UNICODE_ISDIGIT_METHODDEF
13306 UNICODE_ISNUMERIC_METHODDEF
13307 UNICODE_ISALPHA_METHODDEF
13308 UNICODE_ISALNUM_METHODDEF
13309 UNICODE_ISIDENTIFIER_METHODDEF
13310 UNICODE_ISPRINTABLE_METHODDEF
13311 UNICODE_ZFILL_METHODDEF
13312 {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
13313 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13314 UNICODE___FORMAT___METHODDEF
13315 UNICODE_MAKETRANS_METHODDEF
13316 UNICODE_SIZEOF_METHODDEF
13317 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
13318 {NULL, NULL}
13319 };
13320
13321 static PyObject *
13322 11075364 unicode_mod(PyObject *v, PyObject *w)
13323 {
13324
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 11075364 times.
11075364 if (!PyUnicode_Check(v))
13325 Py_RETURN_NOTIMPLEMENTED;
13326 11075364 return PyUnicode_Format(v, w);
13327 }
13328
13329 static PyNumberMethods unicode_as_number = {
13330 0, /*nb_add*/
13331 0, /*nb_subtract*/
13332 0, /*nb_multiply*/
13333 unicode_mod, /*nb_remainder*/
13334 };
13335
13336 static PySequenceMethods unicode_as_sequence = {
13337 (lenfunc) unicode_length, /* sq_length */
13338 PyUnicode_Concat, /* sq_concat */
13339 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13340 (ssizeargfunc) unicode_getitem, /* sq_item */
13341 0, /* sq_slice */
13342 0, /* sq_ass_item */
13343 0, /* sq_ass_slice */
13344 PyUnicode_Contains, /* sq_contains */
13345 };
13346
13347 static PyObject*
13348 65123464 unicode_subscript(PyObject* self, PyObject* item)
13349 {
13350
2/2
✓ Branch 1 taken 55122851 times.
✓ Branch 2 taken 10000613 times.
65123464 if (_PyIndex_Check(item)) {
13351 55122851 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13352
3/4
✓ Branch 0 taken 735111 times.
✓ Branch 1 taken 54387740 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 735111 times.
55122851 if (i == -1 && PyErr_Occurred())
13353 return NULL;
13354
2/2
✓ Branch 0 taken 958317 times.
✓ Branch 1 taken 54164534 times.
55122851 if (i < 0)
13355 958317 i += PyUnicode_GET_LENGTH(self);
13356 55122851 return unicode_getitem(self, i);
13357
1/2
✓ Branch 1 taken 10000613 times.
✗ Branch 2 not taken.
10000613 } else if (PySlice_Check(item)) {
13358 Py_ssize_t start, stop, step, slicelength, i;
13359 size_t cur;
13360 PyObject *result;
13361 const void *src_data;
13362 void *dest_data;
13363 int src_kind, dest_kind;
13364 Py_UCS4 ch, max_char, kind_limit;
13365
13366
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 10000613 times.
10000613 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
13367 return NULL;
13368 }
13369 10000613 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13370 &start, &stop, step);
13371
13372
2/2
✓ Branch 0 taken 124246 times.
✓ Branch 1 taken 9876367 times.
10000613 if (slicelength <= 0) {
13373 124246 _Py_RETURN_UNICODE_EMPTY();
13374
6/6
✓ Branch 0 taken 4800101 times.
✓ Branch 1 taken 5076266 times.
✓ Branch 2 taken 4782318 times.
✓ Branch 3 taken 17783 times.
✓ Branch 4 taken 955336 times.
✓ Branch 5 taken 3826982 times.
14658685 } else if (start == 0 && step == 1 &&
13375 4782318 slicelength == PyUnicode_GET_LENGTH(self)) {
13376 955336 return unicode_result_unchanged(self);
13377
2/2
✓ Branch 0 taken 8900266 times.
✓ Branch 1 taken 20765 times.
8921031 } else if (step == 1) {
13378 8900266 return PyUnicode_Substring(self,
13379 start, start + slicelength);
13380 }
13381 /* General case */
13382 20765 src_kind = PyUnicode_KIND(self);
13383 20765 src_data = PyUnicode_DATA(self);
13384
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 20765 times.
20765 if (!PyUnicode_IS_ASCII(self)) {
13385 kind_limit = kind_maxchar_limit(src_kind);
13386 max_char = 0;
13387 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13388 ch = PyUnicode_READ(src_kind, src_data, cur);
13389 if (ch > max_char) {
13390 max_char = ch;
13391 if (max_char >= kind_limit)
13392 break;
13393 }
13394 }
13395 }
13396 else
13397 20765 max_char = 127;
13398 20765 result = PyUnicode_New(slicelength, max_char);
13399
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 20765 times.
20765 if (result == NULL)
13400 return NULL;
13401 20765 dest_kind = PyUnicode_KIND(result);
13402 20765 dest_data = PyUnicode_DATA(result);
13403
13404
2/2
✓ Branch 0 taken 174964 times.
✓ Branch 1 taken 20765 times.
195729 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13405 174964 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13406 174964 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13407 }
13408 assert(_PyUnicode_CheckConsistency(result, 1));
13409 20765 return result;
13410 } else {
13411 PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
13412 Py_TYPE(item)->tp_name);
13413 return NULL;
13414 }
13415 }
13416
13417 static PyMappingMethods unicode_as_mapping = {
13418 (lenfunc)unicode_length, /* mp_length */
13419 (binaryfunc)unicode_subscript, /* mp_subscript */
13420 (objobjargproc)0, /* mp_ass_subscript */
13421 };
13422
13423
13424 /* Helpers for PyUnicode_Format() */
13425
13426 struct unicode_formatter_t {
13427 PyObject *args;
13428 int args_owned;
13429 Py_ssize_t arglen, argidx;
13430 PyObject *dict;
13431
13432 int fmtkind;
13433 Py_ssize_t fmtcnt, fmtpos;
13434 const void *fmtdata;
13435 PyObject *fmtstr;
13436
13437 _PyUnicodeWriter writer;
13438 };
13439
13440 struct unicode_format_arg_t {
13441 Py_UCS4 ch;
13442 int flags;
13443 Py_ssize_t width;
13444 int prec;
13445 int sign;
13446 };
13447
13448 static PyObject *
13449 17186411 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
13450 {
13451 17186411 Py_ssize_t argidx = ctx->argidx;
13452
13453
1/2
✓ Branch 0 taken 17186411 times.
✗ Branch 1 not taken.
17186411 if (argidx < ctx->arglen) {
13454 17186411 ctx->argidx++;
13455
2/2
✓ Branch 0 taken 5055889 times.
✓ Branch 1 taken 12130522 times.
17186411 if (ctx->arglen < 0)
13456 5055889 return ctx->args;
13457 else
13458 12130522 return PyTuple_GetItem(ctx->args, argidx);
13459 }
13460 PyErr_SetString(PyExc_TypeError,
13461 "not enough arguments for format string");
13462 return NULL;
13463 }
13464
13465 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
13466
13467 /* Format a float into the writer if the writer is not NULL, or into *p_output
13468 otherwise.
13469
13470 Return 0 on success, raise an exception and return -1 on error. */
13471 static int
13472 3177 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13473 PyObject **p_output,
13474 _PyUnicodeWriter *writer)
13475 {
13476 char *p;
13477 double x;
13478 Py_ssize_t len;
13479 int prec;
13480 3177 int dtoa_flags = 0;
13481
13482 3177 x = PyFloat_AsDouble(v);
13483
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 3177 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
3177 if (x == -1.0 && PyErr_Occurred())
13484 return -1;
13485
13486 3177 prec = arg->prec;
13487
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3177 times.
3177 if (prec < 0)
13488 prec = 6;
13489
13490
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3177 times.
3177 if (arg->flags & F_ALT)
13491 dtoa_flags |= Py_DTSF_ALT;
13492
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3177 times.
3177 if (arg->flags & F_NO_NEG_0)
13493 dtoa_flags |= Py_DTSF_NO_NEG_0;
13494 3177 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
13495
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3177 times.
3177 if (p == NULL)
13496 return -1;
13497 3177 len = strlen(p);
13498
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3177 times.
3177 if (writer) {
13499 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
13500 PyMem_Free(p);
13501 return -1;
13502 }
13503 }
13504 else
13505 3177 *p_output = _PyUnicode_FromASCII(p, len);
13506 3177 PyMem_Free(p);
13507 3177 return 0;
13508 }
13509
13510 /* formatlong() emulates the format codes d, u, o, x and X, and
13511 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13512 * Python's regular ints.
13513 * Return value: a new PyUnicodeObject*, or NULL if error.
13514 * The output string is of the form
13515 * "-"? ("0x" | "0X")? digit+
13516 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13517 * set in flags. The case of hex digits will be correct,
13518 * There will be at least prec digits, zero-filled on the left if
13519 * necessary to get that many.
13520 * val object to be converted
13521 * flags bitmask of format flags; only F_ALT is looked at
13522 * prec minimum number of digits; 0-fill on left if needed
13523 * type a character in [duoxX]; u acts the same as d
13524 *
13525 * CAUTION: o, x and X conversions on regular ints can never
13526 * produce a '-' sign, but can for Python's unbounded ints.
13527 */
13528 PyObject *
13529 233256 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
13530 {
13531 233256 PyObject *result = NULL;
13532 char *buf;
13533 Py_ssize_t i;
13534 int sign; /* 1 if '-', else 0 */
13535 int len; /* number of characters */
13536 Py_ssize_t llen;
13537 int numdigits; /* len == numnondigits + numdigits */
13538 233256 int numnondigits = 0;
13539
13540 /* Avoid exceeding SSIZE_T_MAX */
13541
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 233256 times.
233256 if (prec > INT_MAX-3) {
13542 PyErr_SetString(PyExc_OverflowError,
13543 "precision too large");
13544 return NULL;
13545 }
13546
13547 assert(PyLong_Check(val));
13548
13549
3/4
✗ Branch 0 not taken.
✓ Branch 1 taken 79800 times.
✓ Branch 2 taken 45705 times.
✓ Branch 3 taken 107751 times.
233256 switch (type) {
13550 default:
13551 Py_UNREACHABLE();
13552 79800 case 'd':
13553 case 'i':
13554 case 'u':
13555 /* int and int subclasses should print numerically when a numeric */
13556 /* format code is used (see issue18780) */
13557 79800 result = PyNumber_ToBase(val, 10);
13558 79800 break;
13559 45705 case 'o':
13560 45705 numnondigits = 2;
13561 45705 result = PyNumber_ToBase(val, 8);
13562 45705 break;
13563 107751 case 'x':
13564 case 'X':
13565 107751 numnondigits = 2;
13566 107751 result = PyNumber_ToBase(val, 16);
13567 107751 break;
13568 }
13569
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 233256 times.
233256 if (!result)
13570 return NULL;
13571
13572 assert(unicode_modifiable(result));
13573 assert(PyUnicode_IS_ASCII(result));
13574
13575 /* To modify the string in-place, there can only be one reference. */
13576
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 233256 times.
233256 if (Py_REFCNT(result) != 1) {
13577 Py_DECREF(result);
13578 PyErr_BadInternalCall();
13579 return NULL;
13580 }
13581 233256 buf = PyUnicode_DATA(result);
13582 233256 llen = PyUnicode_GET_LENGTH(result);
13583
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 233256 times.
233256 if (llen > INT_MAX) {
13584 Py_DECREF(result);
13585 PyErr_SetString(PyExc_ValueError,
13586 "string too large in _PyUnicode_FormatLong");
13587 return NULL;
13588 }
13589 233256 len = (int)llen;
13590 233256 sign = buf[0] == '-';
13591 233256 numnondigits += sign;
13592 233256 numdigits = len - numnondigits;
13593 assert(numdigits > 0);
13594
13595 /* Get rid of base marker unless F_ALT */
13596
3/4
✓ Branch 0 taken 233256 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 187551 times.
✓ Branch 3 taken 45705 times.
233256 if (((alt) == 0 &&
13597
4/4
✓ Branch 0 taken 184904 times.
✓ Branch 1 taken 2647 times.
✓ Branch 2 taken 105104 times.
✓ Branch 3 taken 79800 times.
187551 (type == 'o' || type == 'x' || type == 'X'))) {
13598 assert(buf[sign] == '0');
13599 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13600 buf[sign+1] == 'o');
13601 153456 numnondigits -= 2;
13602 153456 buf += 2;
13603 153456 len -= 2;
13604
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 153456 times.
153456 if (sign)
13605 buf[0] = '-';
13606 assert(len == numnondigits + numdigits);
13607 assert(numdigits > 0);
13608 }
13609
13610 /* Fill with leading zeroes to meet minimum width. */
13611
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 233256 times.
233256 if (prec > numdigits) {
13612 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13613 numnondigits + prec);
13614 char *b1;
13615 if (!r1) {
13616 Py_DECREF(result);
13617 return NULL;
13618 }
13619 b1 = PyBytes_AS_STRING(r1);
13620 for (i = 0; i < numnondigits; ++i)
13621 *b1++ = *buf++;
13622 for (i = 0; i < prec - numdigits; i++)
13623 *b1++ = '0';
13624 for (i = 0; i < numdigits; i++)
13625 *b1++ = *buf++;
13626 *b1 = '\0';
13627 Py_DECREF(result);
13628 result = r1;
13629 buf = PyBytes_AS_STRING(result);
13630 len = numnondigits + prec;
13631 }
13632
13633 /* Fix up case for hex conversions. */
13634
2/2
✓ Branch 0 taken 105104 times.
✓ Branch 1 taken 128152 times.
233256 if (type == 'X') {
13635 /* Need to convert all lower case letters to upper case.
13636 and need to convert 0x to 0X (and -0x to -0X). */
13637
2/2
✓ Branch 0 taken 203408 times.
✓ Branch 1 taken 105104 times.
308512 for (i = 0; i < len; i++)
13638
3/4
✓ Branch 0 taken 78036 times.
✓ Branch 1 taken 125372 times.
✓ Branch 2 taken 78036 times.
✗ Branch 3 not taken.
203408 if (buf[i] >= 'a' && buf[i] <= 'x')
13639 78036 buf[i] -= 'a'-'A';
13640 }
13641
1/2
✓ Branch 2 taken 233256 times.
✗ Branch 3 not taken.
233256 if (!PyUnicode_Check(result)
13642
2/2
✓ Branch 1 taken 153456 times.
✓ Branch 2 taken 79800 times.
233256 || buf != PyUnicode_DATA(result)) {
13643 PyObject *unicode;
13644 153456 unicode = _PyUnicode_FromASCII(buf, len);
13645 153456 Py_DECREF(result);
13646 153456 result = unicode;
13647 }
13648
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 79800 times.
79800 else if (len != PyUnicode_GET_LENGTH(result)) {
13649 if (PyUnicode_Resize(&result, len) < 0)
13650 Py_CLEAR(result);
13651 }
13652 233256 return result;
13653 }
13654
13655 /* Format an integer or a float as an integer.
13656 * Return 1 if the number has been formatted into the writer,
13657 * 0 if the number has been formatted into *p_output
13658 * -1 and raise an exception on error */
13659 static int
13660 12210186 mainformatlong(PyObject *v,
13661 struct unicode_format_arg_t *arg,
13662 PyObject **p_output,
13663 _PyUnicodeWriter *writer)
13664 {
13665 PyObject *iobj, *res;
13666 12210186 char type = (char)arg->ch;
13667
13668
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 12210186 times.
12210186 if (!PyNumber_Check(v))
13669 goto wrongtype;
13670
13671 /* make sure number is a type of integer for o, x, and X */
13672
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 12210186 times.
12210186 if (!PyLong_Check(v)) {
13673 if (type == 'o' || type == 'x' || type == 'X') {
13674 iobj = _PyNumber_Index(v);
13675 }
13676 else {
13677 iobj = PyNumber_Long(v);
13678 }
13679 if (iobj == NULL ) {
13680 if (PyErr_ExceptionMatches(PyExc_TypeError))
13681 goto wrongtype;
13682 return -1;
13683 }
13684 assert(PyLong_Check(iobj));
13685 }
13686 else {
13687 12210186 iobj = v;
13688 12210186 Py_INCREF(iobj);
13689 }
13690
13691
1/2
✓ Branch 1 taken 12210186 times.
✗ Branch 2 not taken.
12210186 if (PyLong_CheckExact(v)
13692
3/4
✓ Branch 0 taken 11976930 times.
✓ Branch 1 taken 233256 times.
✓ Branch 2 taken 11976930 times.
✗ Branch 3 not taken.
12210186 && arg->width == -1 && arg->prec == -1
13693
1/2
✓ Branch 0 taken 11976930 times.
✗ Branch 1 not taken.
11976930 && !(arg->flags & (F_SIGN | F_BLANK))
13694
1/2
✓ Branch 0 taken 11976930 times.
✗ Branch 1 not taken.
11976930 && type != 'X')
13695 {
13696 /* Fast path */
13697 11976930 int alternate = arg->flags & F_ALT;
13698 int base;
13699
13700
3/4
✗ Branch 0 not taken.
✓ Branch 1 taken 11913718 times.
✓ Branch 2 taken 14 times.
✓ Branch 3 taken 63198 times.
11976930 switch(type)
13701 {
13702 default:
13703 Py_UNREACHABLE();
13704 11913718 case 'd':
13705 case 'i':
13706 case 'u':
13707 11913718 base = 10;
13708 11913718 break;
13709 14 case 'o':
13710 14 base = 8;
13711 14 break;
13712 63198 case 'x':
13713 case 'X':
13714 63198 base = 16;
13715 63198 break;
13716 }
13717
13718
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 11976930 times.
11976930 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13719 Py_DECREF(iobj);
13720 return -1;
13721 }
13722 11976930 Py_DECREF(iobj);
13723 11976930 return 1;
13724 }
13725
13726 233256 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
13727 233256 Py_DECREF(iobj);
13728
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 233256 times.
233256 if (res == NULL)
13729 return -1;
13730 233256 *p_output = res;
13731 233256 return 0;
13732
13733 wrongtype:
13734 switch(type)
13735 {
13736 case 'o':
13737 case 'x':
13738 case 'X':
13739 PyErr_Format(PyExc_TypeError,
13740 "%%%c format: an integer is required, "
13741 "not %.200s",
13742 type, Py_TYPE(v)->tp_name);
13743 break;
13744 default:
13745 PyErr_Format(PyExc_TypeError,
13746 "%%%c format: a real number is required, "
13747 "not %.200s",
13748 type, Py_TYPE(v)->tp_name);
13749 break;
13750 }
13751 return -1;
13752 }
13753
13754 static Py_UCS4
13755 704 formatchar(PyObject *v)
13756 {
13757 /* presume that the buffer is at least 3 characters long */
13758
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 704 times.
704 if (PyUnicode_Check(v)) {
13759 if (PyUnicode_GET_LENGTH(v) == 1) {
13760 return PyUnicode_READ_CHAR(v, 0);
13761 }
13762 goto onError;
13763 }
13764 else {
13765 int overflow;
13766 704 long x = PyLong_AsLongAndOverflow(v, &overflow);
13767
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 704 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
704 if (x == -1 && PyErr_Occurred()) {
13768 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
13769 goto onError;
13770 }
13771 704 return (Py_UCS4) -1;
13772 }
13773
13774
2/4
✓ Branch 0 taken 704 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 704 times.
704 if (x < 0 || x > MAX_UNICODE) {
13775 /* this includes an overflow in converting to C long */
13776 PyErr_SetString(PyExc_OverflowError,
13777 "%c arg not in range(0x110000)");
13778 return (Py_UCS4) -1;
13779 }
13780
13781 704 return (Py_UCS4) x;
13782 }
13783
13784 onError:
13785 PyErr_SetString(PyExc_TypeError,
13786 "%c requires int or char");
13787 return (Py_UCS4) -1;
13788 }
13789
13790 /* Parse options of an argument: flags, width, precision.
13791 Handle also "%(name)" syntax.
13792
13793 Return 0 if the argument has been formatted into arg->str.
13794 Return 1 if the argument has been written into ctx->writer,
13795 Raise an exception and return -1 on error. */
13796 static int
13797 17186411 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13798 struct unicode_format_arg_t *arg)
13799 {
13800 #define FORMAT_READ(ctx) \
13801 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13802
13803 PyObject *v;
13804
13805
2/2
✓ Branch 0 taken 367447 times.
✓ Branch 1 taken 16818964 times.
17186411 if (arg->ch == '(') {
13806 /* Get argument value from a dictionary. Example: "%(name)s". */
13807 Py_ssize_t keystart;
13808 Py_ssize_t keylen;
13809 PyObject *key;
13810 367447 int pcount = 1;
13811
13812
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 367447 times.
367447 if (ctx->dict == NULL) {
13813 PyErr_SetString(PyExc_TypeError,
13814 "format requires a mapping");
13815 return -1;
13816 }
13817 367447 ++ctx->fmtpos;
13818 367447 --ctx->fmtcnt;
13819 367447 keystart = ctx->fmtpos;
13820 /* Skip over balanced parentheses */
13821
3/4
✓ Branch 0 taken 2779641 times.
✓ Branch 1 taken 367447 times.
✓ Branch 2 taken 2779641 times.
✗ Branch 3 not taken.
3147088 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13822 2779641 arg->ch = FORMAT_READ(ctx);
13823
2/2
✓ Branch 0 taken 367447 times.
✓ Branch 1 taken 2412194 times.
2779641 if (arg->ch == ')')
13824 367447 --pcount;
13825
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2412194 times.
2412194 else if (arg->ch == '(')
13826 ++pcount;
13827 2779641 ctx->fmtpos++;
13828 }
13829 367447 keylen = ctx->fmtpos - keystart - 1;
13830
2/4
✓ Branch 0 taken 367447 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 367447 times.
367447 if (ctx->fmtcnt < 0 || pcount > 0) {
13831 PyErr_SetString(PyExc_ValueError,
13832 "incomplete format key");
13833 return -1;
13834 }
13835 367447 key = PyUnicode_Substring(ctx->fmtstr,
13836 keystart, keystart + keylen);
13837
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 367447 times.
367447 if (key == NULL)
13838 return -1;
13839
2/2
✓ Branch 0 taken 113178 times.
✓ Branch 1 taken 254269 times.
367447 if (ctx->args_owned) {
13840 113178 ctx->args_owned = 0;
13841 113178 Py_DECREF(ctx->args);
13842 }
13843 367447 ctx->args = PyObject_GetItem(ctx->dict, key);
13844 367447 Py_DECREF(key);
13845
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 367447 times.
367447 if (ctx->args == NULL)
13846 return -1;
13847 367447 ctx->args_owned = 1;
13848 367447 ctx->arglen = -1;
13849 367447 ctx->argidx = -2;
13850 }
13851
13852 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
13853
1/2
✓ Branch 0 taken 17419667 times.
✗ Branch 1 not taken.
17419667 while (--ctx->fmtcnt >= 0) {
13854 17419667 arg->ch = FORMAT_READ(ctx);
13855 17419667 ctx->fmtpos++;
13856
2/6
✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 233256 times.
✓ Branch 5 taken 17186411 times.
17419667 switch (arg->ch) {
13857 case '-': arg->flags |= F_LJUST; continue;
13858 case '+': arg->flags |= F_SIGN; continue;
13859 case ' ': arg->flags |= F_BLANK; continue;
13860 case '#': arg->flags |= F_ALT; continue;
13861 233256 case '0': arg->flags |= F_ZERO; continue;
13862 }
13863 17186411 break;
13864 }
13865
13866 /* Parse width. Example: "%10s" => width=10 */
13867
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 17186411 times.
17186411 if (arg->ch == '*') {
13868 v = unicode_format_getnextarg(ctx);
13869 if (v == NULL)
13870 return -1;
13871 if (!PyLong_Check(v)) {
13872 PyErr_SetString(PyExc_TypeError,
13873 "* wants int");
13874 return -1;
13875 }
13876 arg->width = PyLong_AsSsize_t(v);
13877 if (arg->width == -1 && PyErr_Occurred())
13878 return -1;
13879 if (arg->width < 0) {
13880 arg->flags |= F_LJUST;
13881 arg->width = -arg->width;
13882 }
13883 if (--ctx->fmtcnt >= 0) {
13884 arg->ch = FORMAT_READ(ctx);
13885 ctx->fmtpos++;
13886 }
13887 }
13888
4/4
✓ Branch 0 taken 17183234 times.
✓ Branch 1 taken 3177 times.
✓ Branch 2 taken 234432 times.
✓ Branch 3 taken 16948802 times.
17186411 else if (arg->ch >= '0' && arg->ch <= '9') {
13889 234432 arg->width = arg->ch - '0';
13890
1/2
✓ Branch 0 taken 235616 times.
✗ Branch 1 not taken.
235616 while (--ctx->fmtcnt >= 0) {
13891 235616 arg->ch = FORMAT_READ(ctx);
13892 235616 ctx->fmtpos++;
13893
3/4
✓ Branch 0 taken 235616 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1184 times.
✓ Branch 3 taken 234432 times.
235616 if (arg->ch < '0' || arg->ch > '9')
13894 break;
13895 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13896 mixing signed and unsigned comparison. Since arg->ch is between
13897 '0' and '9', casting to int is safe. */
13898
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1184 times.
1184 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13899 PyErr_SetString(PyExc_ValueError,
13900 "width too big");
13901 return -1;
13902 }
13903 1184 arg->width = arg->width*10 + (arg->ch - '0');
13904 }
13905 }
13906
13907 /* Parse precision. Example: "%.3f" => prec=3 */
13908
2/2
✓ Branch 0 taken 3177 times.
✓ Branch 1 taken 17183234 times.
17186411 if (arg->ch == '.') {
13909 3177 arg->prec = 0;
13910
1/2
✓ Branch 0 taken 3177 times.
✗ Branch 1 not taken.
3177 if (--ctx->fmtcnt >= 0) {
13911 3177 arg->ch = FORMAT_READ(ctx);
13912 3177 ctx->fmtpos++;
13913 }
13914
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3177 times.
3177 if (arg->ch == '*') {
13915 v = unicode_format_getnextarg(ctx);
13916 if (v == NULL)
13917 return -1;
13918 if (!PyLong_Check(v)) {
13919 PyErr_SetString(PyExc_TypeError,
13920 "* wants int");
13921 return -1;
13922 }
13923 arg->prec = _PyLong_AsInt(v);
13924 if (arg->prec == -1 && PyErr_Occurred())
13925 return -1;
13926 if (arg->prec < 0)
13927 arg->prec = 0;
13928 if (--ctx->fmtcnt >= 0) {
13929 arg->ch = FORMAT_READ(ctx);
13930 ctx->fmtpos++;
13931 }
13932 }
13933
2/4
✓ Branch 0 taken 3177 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 3177 times.
✗ Branch 3 not taken.
3177 else if (arg->ch >= '0' && arg->ch <= '9') {
13934 3177 arg->prec = arg->ch - '0';
13935
1/2
✓ Branch 0 taken 3177 times.
✗ Branch 1 not taken.
3177 while (--ctx->fmtcnt >= 0) {
13936 3177 arg->ch = FORMAT_READ(ctx);
13937 3177 ctx->fmtpos++;
13938
2/4
✓ Branch 0 taken 3177 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3177 times.
3177 if (arg->ch < '0' || arg->ch > '9')
13939 break;
13940 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13941 PyErr_SetString(PyExc_ValueError,
13942 "precision too big");
13943 return -1;
13944 }
13945 arg->prec = arg->prec*10 + (arg->ch - '0');
13946 }
13947 }
13948 }
13949
13950 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13951
1/2
✓ Branch 0 taken 17186411 times.
✗ Branch 1 not taken.
17186411 if (ctx->fmtcnt >= 0) {
13952
3/6
✓ Branch 0 taken 17186411 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 17186411 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 17186411 times.
17186411 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13953 if (--ctx->fmtcnt >= 0) {
13954 arg->ch = FORMAT_READ(ctx);
13955 ctx->fmtpos++;
13956 }
13957 }
13958 }
13959
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 17186411 times.
17186411 if (ctx->fmtcnt < 0) {
13960 PyErr_SetString(PyExc_ValueError,
13961 "incomplete format");
13962 return -1;
13963 }
13964 17186411 return 0;
13965
13966 #undef FORMAT_READ
13967 }
13968
13969 /* Format one argument. Supported conversion specifiers:
13970
13971 - "s", "r", "a": any type
13972 - "i", "d", "u": int or float
13973 - "o", "x", "X": int
13974 - "e", "E", "f", "F", "g", "G": float
13975 - "c": int or str (1 character)
13976
13977 When possible, the output is written directly into the Unicode writer
13978 (ctx->writer). A string is created when padding is required.
13979
13980 Return 0 if the argument has been formatted into *p_str,
13981 1 if the argument has been written into ctx->writer,
13982 -1 on error. */
13983 static int
13984 17186411 unicode_format_arg_format(struct unicode_formatter_t *ctx,
13985 struct unicode_format_arg_t *arg,
13986 PyObject **p_str)
13987 {
13988 PyObject *v;
13989 17186411 _PyUnicodeWriter *writer = &ctx->writer;
13990
13991
2/2
✓ Branch 0 taken 10375487 times.
✓ Branch 1 taken 6810924 times.
17186411 if (ctx->fmtcnt == 0)
13992 10375487 ctx->writer.overallocate = 0;
13993
13994 17186411 v = unicode_format_getnextarg(ctx);
13995
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 17186411 times.
17186411 if (v == NULL)
13996 return -1;
13997
13998
13999
4/5
✓ Branch 0 taken 4972344 times.
✓ Branch 1 taken 12210186 times.
✓ Branch 2 taken 3177 times.
✓ Branch 3 taken 704 times.
✗ Branch 4 not taken.
17186411 switch (arg->ch) {
14000 4972344 case 's':
14001 case 'r':
14002 case 'a':
14003
4/6
✓ Branch 1 taken 135958 times.
✓ Branch 2 taken 4836386 times.
✓ Branch 3 taken 135958 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 135958 times.
✗ Branch 6 not taken.
4972344 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14004 /* Fast path */
14005
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 135958 times.
135958 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14006 return -1;
14007 135958 return 1;
14008 }
14009
14010
4/4
✓ Branch 1 taken 4824601 times.
✓ Branch 2 taken 11785 times.
✓ Branch 3 taken 4818871 times.
✓ Branch 4 taken 5730 times.
4836386 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14011 4818871 *p_str = v;
14012 4818871 Py_INCREF(*p_str);
14013 }
14014 else {
14015
2/2
✓ Branch 0 taken 11279 times.
✓ Branch 1 taken 6236 times.
17515 if (arg->ch == 's')
14016 11279 *p_str = PyObject_Str(v);
14017
1/2
✓ Branch 0 taken 6236 times.
✗ Branch 1 not taken.
6236 else if (arg->ch == 'r')
14018 6236 *p_str = PyObject_Repr(v);
14019 else
14020 *p_str = PyObject_ASCII(v);
14021 }
14022 4836386 break;
14023
14024 12210186 case 'i':
14025 case 'd':
14026 case 'u':
14027 case 'o':
14028 case 'x':
14029 case 'X':
14030 {
14031 12210186 int ret = mainformatlong(v, arg, p_str, writer);
14032
2/2
✓ Branch 0 taken 11976930 times.
✓ Branch 1 taken 233256 times.
12210186 if (ret != 0)
14033 11976930 return ret;
14034 233256 arg->sign = 1;
14035 233256 break;
14036 }
14037
14038 3177 case 'e':
14039 case 'E':
14040 case 'f':
14041 case 'F':
14042 case 'g':
14043 case 'G':
14044
2/4
✓ Branch 0 taken 3177 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3177 times.
3177 if (arg->width == -1 && arg->prec == -1
14045 && !(arg->flags & (F_SIGN | F_BLANK)))
14046 {
14047 /* Fast path */
14048 if (formatfloat(v, arg, NULL, writer) == -1)
14049 return -1;
14050 return 1;
14051 }
14052
14053 3177 arg->sign = 1;
14054
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3177 times.
3177 if (formatfloat(v, arg, p_str, NULL) == -1)
14055 return -1;
14056 3177 break;
14057
14058 704 case 'c':
14059 {
14060 704 Py_UCS4 ch = formatchar(v);
14061
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 704 times.
704 if (ch == (Py_UCS4) -1)
14062 return -1;
14063
2/4
✓ Branch 0 taken 704 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 704 times.
✗ Branch 3 not taken.
704 if (arg->width == -1 && arg->prec == -1) {
14064 /* Fast path */
14065
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 704 times.
704 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14066 return -1;
14067 704 return 1;
14068 }
14069 *p_str = PyUnicode_FromOrdinal(ch);
14070 break;
14071 }
14072
14073 default:
14074 PyErr_Format(PyExc_ValueError,
14075 "unsupported format character '%c' (0x%x) "
14076 "at index %zd",
14077 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14078 (int)arg->ch,
14079 ctx->fmtpos - 1);
14080 return -1;
14081 }
14082
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5072819 times.
5072819 if (*p_str == NULL)
14083 return -1;
14084 assert (PyUnicode_Check(*p_str));
14085 5072819 return 0;
14086 }
14087
14088 static int
14089 5072819 unicode_format_arg_output(struct unicode_formatter_t *ctx,
14090 struct unicode_format_arg_t *arg,
14091 PyObject *str)
14092 {
14093 Py_ssize_t len;
14094 int kind;
14095 const void *pbuf;
14096 Py_ssize_t pindex;
14097 Py_UCS4 signchar;
14098 Py_ssize_t buflen;
14099 Py_UCS4 maxchar;
14100 Py_ssize_t sublen;
14101 5072819 _PyUnicodeWriter *writer = &ctx->writer;
14102 Py_UCS4 fill;
14103
14104 5072819 fill = ' ';
14105
4/4
✓ Branch 0 taken 236433 times.
✓ Branch 1 taken 4836386 times.
✓ Branch 2 taken 233256 times.
✓ Branch 3 taken 3177 times.
5072819 if (arg->sign && arg->flags & F_ZERO)
14106 233256 fill = '0';
14107
14108 5072819 len = PyUnicode_GET_LENGTH(str);
14109
4/4
✓ Branch 0 taken 234432 times.
✓ Branch 1 taken 4838387 times.
✓ Branch 2 taken 197202 times.
✓ Branch 3 taken 37230 times.
5072819 if ((arg->width == -1 || arg->width <= len)
14110
3/4
✓ Branch 0 taken 3177 times.
✓ Branch 1 taken 5032412 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 3177 times.
5035589 && (arg->prec == -1 || arg->prec >= len)
14111
1/2
✓ Branch 0 taken 5032412 times.
✗ Branch 1 not taken.
5032412 && !(arg->flags & (F_SIGN | F_BLANK)))
14112 {
14113 /* Fast path */
14114
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 5032412 times.
5032412 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14115 return -1;
14116 5032412 return 0;
14117 }
14118
14119 /* Truncate the string for "s", "r" and "a" formats
14120 if the precision is set */
14121
4/6
✓ Branch 0 taken 39231 times.
✓ Branch 1 taken 1176 times.
✓ Branch 2 taken 39231 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 39231 times.
40407 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14122
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 1176 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
1176 if (arg->prec >= 0 && len > arg->prec)
14123 len = arg->prec;
14124 }
14125
14126 /* Adjust sign and width */
14127 40407 kind = PyUnicode_KIND(str);
14128 40407 pbuf = PyUnicode_DATA(str);
14129 40407 pindex = 0;
14130 40407 signchar = '\0';
14131
2/2
✓ Branch 0 taken 39231 times.
✓ Branch 1 taken 1176 times.
40407 if (arg->sign) {
14132 39231 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14133
2/4
✓ Branch 0 taken 39231 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 39231 times.
39231 if (ch == '-' || ch == '+') {
14134 signchar = ch;
14135 len--;
14136 pindex++;
14137 }
14138
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 39231 times.
39231 else if (arg->flags & F_SIGN)
14139 signchar = '+';
14140
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 39231 times.
39231 else if (arg->flags & F_BLANK)
14141 signchar = ' ';
14142 else
14143 39231 arg->sign = 0;
14144 }
14145
2/2
✓ Branch 0 taken 3177 times.
✓ Branch 1 taken 37230 times.
40407 if (arg->width < len)
14146 3177 arg->width = len;
14147
14148 /* Prepare the writer */
14149 40407 maxchar = writer->maxchar;
14150
1/2
✓ Branch 0 taken 40407 times.
✗ Branch 1 not taken.
40407 if (!(arg->flags & F_LJUST)) {
14151
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 40407 times.
40407 if (arg->sign) {
14152 if ((arg->width-1) > len)
14153 maxchar = Py_MAX(maxchar, fill);
14154 }
14155 else {
14156
2/2
✓ Branch 0 taken 37230 times.
✓ Branch 1 taken 3177 times.
40407 if (arg->width > len)
14157 37230 maxchar = Py_MAX(maxchar, fill);
14158 }
14159 }
14160
2/2
✓ Branch 1 taken 21152 times.
✓ Branch 2 taken 19255 times.
40407 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14161 21152 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14162 21152 maxchar = Py_MAX(maxchar, strmaxchar);
14163 }
14164
14165 40407 buflen = arg->width;
14166
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 40407 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
40407 if (arg->sign && len == arg->width)
14167 buflen++;
14168
5/8
✓ Branch 0 taken 19255 times.
✓ Branch 1 taken 21152 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 19255 times.
✓ Branch 4 taken 21152 times.
✗ Branch 5 not taken.
✗ Branch 7 not taken.
✓ Branch 8 taken 21152 times.
40407 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
14169 return -1;
14170
14171 /* Write the sign if needed */
14172
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 40407 times.
40407 if (arg->sign) {
14173 if (fill != ' ') {
14174 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14175 writer->pos += 1;
14176 }
14177 if (arg->width > len)
14178 arg->width--;
14179 }
14180
14181 /* Write the numeric prefix for "x", "X" and "o" formats
14182 if the alternate form is used.
14183 For example, write "0x" for the "%#x" format. */
14184
1/8
✗ Branch 0 not taken.
✓ Branch 1 taken 40407 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
40407 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14185 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14186 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14187 if (fill != ' ') {
14188 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14189 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14190 writer->pos += 2;
14191 pindex += 2;
14192 }
14193 arg->width -= 2;
14194 if (arg->width < 0)
14195 arg->width = 0;
14196 len -= 2;
14197 }
14198
14199 /* Pad left with the fill character if needed */
14200
3/4
✓ Branch 0 taken 37230 times.
✓ Branch 1 taken 3177 times.
✓ Branch 2 taken 37230 times.
✗ Branch 3 not taken.
40407 if (arg->width > len && !(arg->flags & F_LJUST)) {
14201 37230 sublen = arg->width - len;
14202 37230 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
14203 37230 writer->pos += sublen;
14204 37230 arg->width = len;
14205 }
14206
14207 /* If padding with spaces: write sign if needed and/or numeric prefix if
14208 the alternate form is used */
14209
2/2
✓ Branch 0 taken 4353 times.
✓ Branch 1 taken 36054 times.
40407 if (fill == ' ') {
14210
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 4353 times.
4353 if (arg->sign) {
14211 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14212 writer->pos += 1;
14213 }
14214
1/8
✗ Branch 0 not taken.
✓ Branch 1 taken 4353 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
4353 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14215 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14216 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14217 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14218 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14219 writer->pos += 2;
14220 pindex += 2;
14221 }
14222 }
14223
14224 /* Write characters */
14225
1/2
✓ Branch 0 taken 40407 times.
✗ Branch 1 not taken.
40407 if (len) {
14226 40407 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14227 str, pindex, len);
14228 40407 writer->pos += len;
14229 }
14230
14231 /* Pad right with the fill character if needed */
14232
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 40407 times.
40407 if (arg->width > len) {
14233 sublen = arg->width - len;
14234 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
14235 writer->pos += sublen;
14236 }
14237 40407 return 0;
14238 }
14239
14240 /* Helper of PyUnicode_Format(): format one arg.
14241 Return 0 on success, raise an exception and return -1 on error. */
14242 static int
14243 17186510 unicode_format_arg(struct unicode_formatter_t *ctx)
14244 {
14245 struct unicode_format_arg_t arg;
14246 PyObject *str;
14247 int ret;
14248
14249 17186510 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14250
2/2
✓ Branch 0 taken 99 times.
✓ Branch 1 taken 17186411 times.
17186510 if (arg.ch == '%') {
14251 99 ctx->fmtpos++;
14252 99 ctx->fmtcnt--;
14253
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 99 times.
99 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14254 return -1;
14255 99 return 0;
14256 }
14257 17186411 arg.flags = 0;
14258 17186411 arg.width = -1;
14259 17186411 arg.prec = -1;
14260 17186411 arg.sign = 0;
14261 17186411 str = NULL;
14262
14263 17186411 ret = unicode_format_arg_parse(ctx, &arg);
14264
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 17186411 times.
17186411 if (ret == -1)
14265 return -1;
14266
14267 17186411 ret = unicode_format_arg_format(ctx, &arg, &str);
14268
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 17186411 times.
17186411 if (ret == -1)
14269 return -1;
14270
14271
2/2
✓ Branch 0 taken 5072819 times.
✓ Branch 1 taken 12113592 times.
17186411 if (ret != 1) {
14272 5072819 ret = unicode_format_arg_output(ctx, &arg, str);
14273 5072819 Py_DECREF(str);
14274
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5072819 times.
5072819 if (ret == -1)
14275 return -1;
14276 }
14277
14278
3/4
✓ Branch 0 taken 367555 times.
✓ Branch 1 taken 16818856 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 367555 times.
17186411 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
14279 PyErr_SetString(PyExc_TypeError,
14280 "not all arguments converted during string formatting");
14281 return -1;
14282 }
14283 17186411 return 0;
14284 }
14285
14286 PyObject *
14287 11075364 PyUnicode_Format(PyObject *format, PyObject *args)
14288 {
14289 struct unicode_formatter_t ctx;
14290
14291
2/4
✓ Branch 0 taken 11075364 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 11075364 times.
11075364 if (format == NULL || args == NULL) {
14292 PyErr_BadInternalCall();
14293 return NULL;
14294 }
14295
14296
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 11075364 times.
11075364 if (ensure_unicode(format) < 0)
14297 return NULL;
14298
14299 11075364 ctx.fmtstr = format;
14300 11075364 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14301 11075364 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14302 11075364 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14303 11075364 ctx.fmtpos = 0;
14304
14305 11075364 _PyUnicodeWriter_Init(&ctx.writer);
14306 11075364 ctx.writer.min_length = ctx.fmtcnt + 100;
14307 11075364 ctx.writer.overallocate = 1;
14308
14309
2/2
✓ Branch 2 taken 6124371 times.
✓ Branch 3 taken 4950993 times.
11075364 if (PyTuple_Check(args)) {
14310 6124371 ctx.arglen = PyTuple_Size(args);
14311 6124371 ctx.argidx = 0;
14312 }
14313 else {
14314 4950993 ctx.arglen = -1;
14315 4950993 ctx.argidx = -2;
14316 }
14317 11075364 ctx.args_owned = 0;
14318
6/6
✓ Branch 1 taken 10629387 times.
✓ Branch 2 taken 445977 times.
✓ Branch 5 taken 4505016 times.
✓ Branch 6 taken 6124371 times.
✓ Branch 9 taken 262659 times.
✓ Branch 10 taken 4242357 times.
11075364 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14319 262659 ctx.dict = args;
14320 else
14321 10812705 ctx.dict = NULL;
14322 11075364 ctx.args = args;
14323
14324
2/2
✓ Branch 0 taken 34626763 times.
✓ Branch 1 taken 11075364 times.
45702127 while (--ctx.fmtcnt >= 0) {
14325
2/2
✓ Branch 1 taken 17440253 times.
✓ Branch 2 taken 17186510 times.
34626763 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14326 Py_ssize_t nonfmtpos;
14327
14328 17440253 nonfmtpos = ctx.fmtpos++;
14329
4/4
✓ Branch 0 taken 68214847 times.
✓ Branch 1 taken 672837 times.
✓ Branch 2 taken 51447431 times.
✓ Branch 3 taken 16767416 times.
137102531 while (ctx.fmtcnt >= 0 &&
14330 68214847 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14331 51447431 ctx.fmtpos++;
14332 51447431 ctx.fmtcnt--;
14333 }
14334
2/2
✓ Branch 0 taken 672837 times.
✓ Branch 1 taken 16767416 times.
17440253 if (ctx.fmtcnt < 0) {
14335 672837 ctx.fmtpos--;
14336 672837 ctx.writer.overallocate = 0;
14337 }
14338
14339
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 17440253 times.
17440253 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14340 nonfmtpos, ctx.fmtpos) < 0)
14341 goto onError;
14342 }
14343 else {
14344 17186510 ctx.fmtpos++;
14345
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 17186510 times.
17186510 if (unicode_format_arg(&ctx) == -1)
14346 goto onError;
14347 }
14348 }
14349
14350
3/4
✓ Branch 0 taken 8282 times.
✓ Branch 1 taken 11067082 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 8282 times.
11075364 if (ctx.argidx < ctx.arglen && !ctx.dict) {
14351 PyErr_SetString(PyExc_TypeError,
14352 "not all arguments converted during string formatting");
14353 goto onError;
14354 }
14355
14356
2/2
✓ Branch 0 taken 254269 times.
✓ Branch 1 taken 10821095 times.
11075364 if (ctx.args_owned) {
14357 254269 Py_DECREF(ctx.args);
14358 }
14359 11075364 return _PyUnicodeWriter_Finish(&ctx.writer);
14360
14361 onError:
14362 _PyUnicodeWriter_Dealloc(&ctx.writer);
14363 if (ctx.args_owned) {
14364 Py_DECREF(ctx.args);
14365 }
14366 return NULL;
14367 }
14368
14369 static PyObject *
14370 unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
14371
14372 /*[clinic input]
14373 @classmethod
14374 str.__new__ as unicode_new
14375
14376 object as x: object = NULL
14377 encoding: str = NULL
14378 errors: str = NULL
14379
14380 [clinic start generated code]*/
14381
14382 static PyObject *
14383 5134118 unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
14384 const char *errors)
14385 /*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
14386 {
14387 PyObject *unicode;
14388
2/2
✓ Branch 0 taken 1035 times.
✓ Branch 1 taken 5133083 times.
5134118 if (x == NULL) {
14389 1035 unicode = unicode_new_empty();
14390 }
14391
3/4
✓ Branch 0 taken 4485243 times.
✓ Branch 1 taken 647840 times.
✓ Branch 2 taken 4485243 times.
✗ Branch 3 not taken.
5133083 else if (encoding == NULL && errors == NULL) {
14392 4485243 unicode = PyObject_Str(x);
14393 }
14394 else {
14395 647840 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
14396 }
14397
14398
3/4
✓ Branch 0 taken 5134118 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2893096 times.
✓ Branch 3 taken 2241022 times.
5134118 if (unicode != NULL && type != &PyUnicode_Type) {
14399 2893096 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
14400 }
14401 5134118 return unicode;
14402 }
14403
14404 static PyObject *
14405 2893096 unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
14406 {
14407 PyObject *self;
14408 Py_ssize_t length, char_size;
14409 int share_utf8;
14410 int kind;
14411 void *data;
14412
14413 assert(PyType_IsSubtype(type, &PyUnicode_Type));
14414 assert(_PyUnicode_CHECK(unicode));
14415
14416 2893096 self = type->tp_alloc(type, 0);
14417
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2893096 times.
2893096 if (self == NULL) {
14418 return NULL;
14419 }
14420 2893096 kind = PyUnicode_KIND(unicode);
14421 2893096 length = PyUnicode_GET_LENGTH(unicode);
14422
14423 2893096 _PyUnicode_LENGTH(self) = length;
14424 #ifdef Py_DEBUG
14425 _PyUnicode_HASH(self) = -1;
14426 #else
14427 2893096 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14428 #endif
14429 2893096 _PyUnicode_STATE(self).interned = 0;
14430 2893096 _PyUnicode_STATE(self).kind = kind;
14431 2893096 _PyUnicode_STATE(self).compact = 0;
14432 2893096 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14433 2893096 _PyUnicode_UTF8_LENGTH(self) = 0;
14434 2893096 _PyUnicode_UTF8(self) = NULL;
14435 2893096 _PyUnicode_DATA_ANY(self) = NULL;
14436
14437 2893096 share_utf8 = 0;
14438
1/2
✓ Branch 0 taken 2893096 times.
✗ Branch 1 not taken.
2893096 if (kind == PyUnicode_1BYTE_KIND) {
14439 2893096 char_size = 1;
14440
1/2
✓ Branch 1 taken 2893096 times.
✗ Branch 2 not taken.
2893096 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14441 2893096 share_utf8 = 1;
14442 }
14443 else if (kind == PyUnicode_2BYTE_KIND) {
14444 char_size = 2;
14445 }
14446 else {
14447 assert(kind == PyUnicode_4BYTE_KIND);
14448 char_size = 4;
14449 }
14450
14451 /* Ensure we won't overflow the length. */
14452
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2893096 times.
2893096 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14453 PyErr_NoMemory();
14454 goto onError;
14455 }
14456 2893096 data = PyObject_Malloc((length + 1) * char_size);
14457
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2893096 times.
2893096 if (data == NULL) {
14458 PyErr_NoMemory();
14459 goto onError;
14460 }
14461
14462 2893096 _PyUnicode_DATA_ANY(self) = data;
14463
1/2
✓ Branch 0 taken 2893096 times.
✗ Branch 1 not taken.
2893096 if (share_utf8) {
14464 2893096 _PyUnicode_UTF8_LENGTH(self) = length;
14465 2893096 _PyUnicode_UTF8(self) = data;
14466 }
14467
14468 2893096 memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
14469 assert(_PyUnicode_CheckConsistency(self, 1));
14470 #ifdef Py_DEBUG
14471 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14472 #endif
14473 2893096 return self;
14474
14475 onError:
14476 Py_DECREF(self);
14477 return NULL;
14478 }
14479
14480 void
14481 16372210 _PyUnicode_ExactDealloc(PyObject *op)
14482 {
14483 assert(PyUnicode_CheckExact(op));
14484 16372210 unicode_dealloc(op);
14485 16372210 }
14486
14487 PyDoc_STRVAR(unicode_doc,
14488 "str(object='') -> str\n\
14489 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14490 \n\
14491 Create a new string object from the given object. If encoding or\n\
14492 errors is specified, then the object must expose a data buffer\n\
14493 that will be decoded using the given encoding and error handler.\n\
14494 Otherwise, returns the result of object.__str__() (if defined)\n\
14495 or repr(object).\n\
14496 encoding defaults to sys.getdefaultencoding().\n\
14497 errors defaults to 'strict'.");
14498
14499 static PyObject *unicode_iter(PyObject *seq);
14500
14501 PyTypeObject PyUnicode_Type = {
14502 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14503 "str", /* tp_name */
14504 sizeof(PyUnicodeObject), /* tp_basicsize */
14505 0, /* tp_itemsize */
14506 /* Slots */
14507 (destructor)unicode_dealloc, /* tp_dealloc */
14508 0, /* tp_vectorcall_offset */
14509 0, /* tp_getattr */
14510 0, /* tp_setattr */
14511 0, /* tp_as_async */
14512 unicode_repr, /* tp_repr */
14513 &unicode_as_number, /* tp_as_number */
14514 &unicode_as_sequence, /* tp_as_sequence */
14515 &unicode_as_mapping, /* tp_as_mapping */
14516 (hashfunc) unicode_hash, /* tp_hash*/
14517 0, /* tp_call*/
14518 (reprfunc) unicode_str, /* tp_str */
14519 PyObject_GenericGetAttr, /* tp_getattro */
14520 0, /* tp_setattro */
14521 0, /* tp_as_buffer */
14522 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14523 Py_TPFLAGS_UNICODE_SUBCLASS |
14524 _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14525 unicode_doc, /* tp_doc */
14526 0, /* tp_traverse */
14527 0, /* tp_clear */
14528 PyUnicode_RichCompare, /* tp_richcompare */
14529 0, /* tp_weaklistoffset */
14530 unicode_iter, /* tp_iter */
14531 0, /* tp_iternext */
14532 unicode_methods, /* tp_methods */
14533 0, /* tp_members */
14534 0, /* tp_getset */
14535 0, /* tp_base */
14536 0, /* tp_dict */
14537 0, /* tp_descr_get */
14538 0, /* tp_descr_set */
14539 0, /* tp_dictoffset */
14540 0, /* tp_init */
14541 0, /* tp_alloc */
14542 unicode_new, /* tp_new */
14543 PyObject_Del, /* tp_free */
14544 };
14545
14546 /* Initialize the Unicode implementation */
14547
14548 void
14549 3408 _PyUnicode_InitState(PyInterpreterState *interp)
14550 {
14551
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3408 times.
3408 if (!_Py_IsMainInterpreter(interp)) {
14552 return;
14553 }
14554
14555 /* initialize the linebreak bloom filter */
14556 3408 const Py_UCS2 linebreak[] = {
14557 0x000A, /* LINE FEED */
14558 0x000D, /* CARRIAGE RETURN */
14559 0x001C, /* FILE SEPARATOR */
14560 0x001D, /* GROUP SEPARATOR */
14561 0x001E, /* RECORD SEPARATOR */
14562 0x0085, /* NEXT LINE */
14563 0x2028, /* LINE SEPARATOR */
14564 0x2029, /* PARAGRAPH SEPARATOR */
14565 };
14566 3408 bloom_linebreak = make_bloom_mask(
14567 PyUnicode_2BYTE_KIND, linebreak,
14568 Py_ARRAY_LENGTH(linebreak));
14569 }
14570
14571
14572 PyStatus
14573 3408 _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14574 {
14575
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3408 times.
3408 if (!_Py_IsMainInterpreter(interp)) {
14576 return _PyStatus_OK();
14577 }
14578
14579 #ifdef Py_DEBUG
14580 assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
14581
14582 for (int i = 0; i < 256; i++) {
14583 assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
14584 }
14585 #endif
14586
14587 3408 return _PyStatus_OK();
14588 }
14589
14590
14591 PyStatus
14592 3408 _PyUnicode_InitTypes(PyInterpreterState *interp)
14593 {
14594
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3408 times.
3408 if (!_Py_IsMainInterpreter(interp)) {
14595 return _PyStatus_OK();
14596 }
14597
14598
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3408 times.
3408 if (PyType_Ready(&EncodingMapType) < 0) {
14599 goto error;
14600 }
14601
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3408 times.
3408 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
14602 goto error;
14603 }
14604
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3408 times.
3408 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
14605 goto error;
14606 }
14607 3408 return _PyStatus_OK();
14608
14609 error:
14610 return _PyStatus_ERR("Can't initialize unicode types");
14611 }
14612
14613
14614 void
14615 581722783 PyUnicode_InternInPlace(PyObject **p)
14616 {
14617 581722783 PyObject *s = *p;
14618 #ifdef Py_DEBUG
14619 assert(s != NULL);
14620 assert(_PyUnicode_CHECK(s));
14621 #else
14622
2/4
✓ Branch 0 taken 581722783 times.
✗ Branch 1 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 581722783 times.
581722783 if (s == NULL || !PyUnicode_Check(s)) {
14623 return;
14624 }
14625 #endif
14626
14627 /* If it's a subclass, we don't really know what putting
14628 it in the interned dict might do. */
14629
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 581722783 times.
581722783 if (!PyUnicode_CheckExact(s)) {
14630 return;
14631 }
14632
14633
2/2
✓ Branch 1 taken 476475200 times.
✓ Branch 2 taken 105247583 times.
581722783 if (PyUnicode_CHECK_INTERNED(s)) {
14634 476475200 return;
14635 }
14636
14637
2/2
✓ Branch 0 taken 3408 times.
✓ Branch 1 taken 105244175 times.
105247583 if (interned == NULL) {
14638 3408 interned = PyDict_New();
14639
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3408 times.
3408 if (interned == NULL) {
14640 PyErr_Clear(); /* Don't leave an exception */
14641 return;
14642 }
14643 }
14644
14645 105247583 PyObject *t = PyDict_SetDefault(interned, s, s);
14646
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 105247583 times.
105247583 if (t == NULL) {
14647 PyErr_Clear();
14648 return;
14649 }
14650
14651
2/2
✓ Branch 0 taken 69613511 times.
✓ Branch 1 taken 35634072 times.
105247583 if (t != s) {
14652 69613511 Py_INCREF(t);
14653 69613511 Py_SETREF(*p, t);
14654 69613511 return;
14655 }
14656
14657 /* The two references in interned dict (key and value) are not counted by
14658 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
14659 this. */
14660 35634072 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
14661 35634072 _PyUnicode_STATE(s).interned = 1;
14662 }
14663
14664 // Function kept for the stable ABI.
14665 PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14666 void
14667 PyUnicode_InternImmortal(PyObject **p)
14668 {
14669 PyUnicode_InternInPlace(p);
14670 // Leak a reference on purpose
14671 Py_INCREF(*p);
14672 }
14673
14674 PyObject *
14675 12829385 PyUnicode_InternFromString(const char *cp)
14676 {
14677 12829385 PyObject *s = PyUnicode_FromString(cp);
14678
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 12829385 times.
12829385 if (s == NULL)
14679 return NULL;
14680 12829385 PyUnicode_InternInPlace(&s);
14681 12829385 return s;
14682 }
14683
14684
14685 void
14686 3404 _PyUnicode_ClearInterned(PyInterpreterState *interp)
14687 {
14688
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3404 times.
3404 if (!_Py_IsMainInterpreter(interp)) {
14689 // interned dict is shared by all interpreters
14690 return;
14691 }
14692
14693
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3404 times.
3404 if (interned == NULL) {
14694 return;
14695 }
14696 assert(PyDict_CheckExact(interned));
14697
14698 /* Interned unicode strings are not forcibly deallocated; rather, we give
14699 them their stolen references back, and then clear and DECREF the
14700 interned dict. */
14701
14702 #ifdef INTERNED_STATS
14703 fprintf(stderr, "releasing %zd interned strings\n",
14704 PyDict_GET_SIZE(interned));
14705
14706 Py_ssize_t total_length = 0;
14707 #endif
14708 3404 Py_ssize_t pos = 0;
14709 PyObject *s, *ignored_value;
14710
2/2
✓ Branch 1 taken 6545674 times.
✓ Branch 2 taken 3404 times.
6549078 while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
14711 assert(PyUnicode_CHECK_INTERNED(s));
14712 // Restore the two references (key and value) ignored
14713 // by PyUnicode_InternInPlace().
14714 6545674 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
14715 #ifdef INTERNED_STATS
14716 total_length += PyUnicode_GET_LENGTH(s);
14717 #endif
14718
14719 6545674 _PyUnicode_STATE(s).interned = 0;
14720 }
14721 #ifdef INTERNED_STATS
14722 fprintf(stderr,
14723 "total length of all interned strings: %zd characters\n",
14724 total_length);
14725 #endif
14726
14727 3404 PyDict_Clear(interned);
14728
1/2
✓ Branch 0 taken 3404 times.
✗ Branch 1 not taken.
3404 Py_CLEAR(interned);
14729 }
14730
14731
14732 /********************* Unicode Iterator **************************/
14733
14734 typedef struct {
14735 PyObject_HEAD
14736 Py_ssize_t it_index;
14737 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
14738 } unicodeiterobject;
14739
14740 static void
14741 1151258 unicodeiter_dealloc(unicodeiterobject *it)
14742 {
14743 1151258 _PyObject_GC_UNTRACK(it);
14744 1151258 Py_XDECREF(it->it_seq);
14745 1151258 PyObject_GC_Del(it);
14746 1151258 }
14747
14748 static int
14749 264 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14750 {
14751
2/4
✓ Branch 0 taken 264 times.
✗ Branch 1 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 264 times.
264 Py_VISIT(it->it_seq);
14752 264 return 0;
14753 }
14754
14755 static PyObject *
14756 1214466 unicodeiter_next(unicodeiterobject *it)
14757 {
14758 PyObject *seq;
14759
14760 assert(it != NULL);
14761 1214466 seq = it->it_seq;
14762
2/2
✓ Branch 0 taken 1435 times.
✓ Branch 1 taken 1213031 times.
1214466 if (seq == NULL)
14763 1435 return NULL;
14764 assert(_PyUnicode_CHECK(seq));
14765
14766
2/2
✓ Branch 1 taken 1186014 times.
✓ Branch 2 taken 27017 times.
1213031 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14767 1186014 int kind = PyUnicode_KIND(seq);
14768 1186014 const void *data = PyUnicode_DATA(seq);
14769 1186014 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14770 1186014 it->it_index++;
14771 1186014 return unicode_char(chr);
14772 }
14773
14774 27017 it->it_seq = NULL;
14775 27017 Py_DECREF(seq);
14776 27017 return NULL;
14777 }
14778
14779 static PyObject *
14780 5385899 unicode_ascii_iter_next(unicodeiterobject *it)
14781 {
14782 assert(it != NULL);
14783 5385899 PyObject *seq = it->it_seq;
14784
2/2
✓ Branch 0 taken 6888 times.
✓ Branch 1 taken 5379011 times.
5385899 if (seq == NULL) {
14785 6888 return NULL;
14786 }
14787 assert(_PyUnicode_CHECK(seq));
14788 assert(PyUnicode_IS_COMPACT_ASCII(seq));
14789
2/2
✓ Branch 1 taken 4261568 times.
✓ Branch 2 taken 1117443 times.
5379011 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14790 4261568 const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
14791 4261568 Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
14792 data, it->it_index);
14793 4261568 it->it_index++;
14794 4261568 PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
14795 4261568 return Py_NewRef(item);
14796 }
14797 1117443 it->it_seq = NULL;
14798 1117443 Py_DECREF(seq);
14799 1117443 return NULL;
14800 }
14801
14802 static PyObject *
14803 1653 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
14804 {
14805 1653 Py_ssize_t len = 0;
14806
1/2
✓ Branch 0 taken 1653 times.
✗ Branch 1 not taken.
1653 if (it->it_seq)
14807 1653 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
14808 1653 return PyLong_FromSsize_t(len);
14809 }
14810
14811 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14812
14813 static PyObject *
14814 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
14815 {
14816 if (it->it_seq != NULL) {
14817 return Py_BuildValue("N(O)n", _PyEval_GetBuiltin(&_Py_ID(iter)),
14818 it->it_seq, it->it_index);
14819 } else {
14820 PyObject *u = unicode_new_empty();
14821 if (u == NULL)
14822 return NULL;
14823 return Py_BuildValue("N(N)", _PyEval_GetBuiltin(&_Py_ID(iter)), u);
14824 }
14825 }
14826
14827 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14828
14829 static PyObject *
14830 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14831 {
14832 Py_ssize_t index = PyLong_AsSsize_t(state);
14833 if (index == -1 && PyErr_Occurred())
14834 return NULL;
14835 if (it->it_seq != NULL) {
14836 if (index < 0)
14837 index = 0;
14838 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
14839 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
14840 it->it_index = index;
14841 }
14842 Py_RETURN_NONE;
14843 }
14844
14845 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14846
14847 static PyMethodDef unicodeiter_methods[] = {
14848 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
14849 length_hint_doc},
14850 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14851 reduce_doc},
14852 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14853 setstate_doc},
14854 {NULL, NULL} /* sentinel */
14855 };
14856
14857 PyTypeObject PyUnicodeIter_Type = {
14858 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14859 "str_iterator", /* tp_name */
14860 sizeof(unicodeiterobject), /* tp_basicsize */
14861 0, /* tp_itemsize */
14862 /* methods */
14863 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14864 0, /* tp_vectorcall_offset */
14865 0, /* tp_getattr */
14866 0, /* tp_setattr */
14867 0, /* tp_as_async */
14868 0, /* tp_repr */
14869 0, /* tp_as_number */
14870 0, /* tp_as_sequence */
14871 0, /* tp_as_mapping */
14872 0, /* tp_hash */
14873 0, /* tp_call */
14874 0, /* tp_str */
14875 PyObject_GenericGetAttr, /* tp_getattro */
14876 0, /* tp_setattro */
14877 0, /* tp_as_buffer */
14878 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14879 0, /* tp_doc */
14880 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14881 0, /* tp_clear */
14882 0, /* tp_richcompare */
14883 0, /* tp_weaklistoffset */
14884 PyObject_SelfIter, /* tp_iter */
14885 (iternextfunc)unicodeiter_next, /* tp_iternext */
14886 unicodeiter_methods, /* tp_methods */
14887 0,
14888 };
14889
14890 PyTypeObject _PyUnicodeASCIIIter_Type = {
14891 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14892 .tp_name = "str_ascii_iterator",
14893 .tp_basicsize = sizeof(unicodeiterobject),
14894 .tp_dealloc = (destructor)unicodeiter_dealloc,
14895 .tp_getattro = PyObject_GenericGetAttr,
14896 .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14897 .tp_traverse = (traverseproc)unicodeiter_traverse,
14898 .tp_iter = PyObject_SelfIter,
14899 .tp_iternext = (iternextfunc)unicode_ascii_iter_next,
14900 .tp_methods = unicodeiter_methods,
14901 };
14902
14903 static PyObject *
14904 1151258 unicode_iter(PyObject *seq)
14905 {
14906 unicodeiterobject *it;
14907
14908
1/2
✗ Branch 2 not taken.
✓ Branch 3 taken 1151258 times.
1151258 if (!PyUnicode_Check(seq)) {
14909 PyErr_BadInternalCall();
14910 return NULL;
14911 }
14912
2/2
✓ Branch 1 taken 1124241 times.
✓ Branch 2 taken 27017 times.
1151258 if (PyUnicode_IS_COMPACT_ASCII(seq)) {
14913 1124241 it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
14914 }
14915 else {
14916 27017 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14917 }
14918
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1151258 times.
1151258 if (it == NULL)
14919 return NULL;
14920 1151258 it->it_index = 0;
14921 1151258 Py_INCREF(seq);
14922 1151258 it->it_seq = seq;
14923 1151258 _PyObject_GC_TRACK(it);
14924 1151258 return (PyObject *)it;
14925 }
14926
14927 static int
14928 13616 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14929 {
14930 int res;
14931 13616 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14932
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 13616 times.
13616 if (res == -2) {
14933 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
14934 return -1;
14935 }
14936
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 13616 times.
13616 if (res < 0) {
14937 PyErr_NoMemory();
14938 return -1;
14939 }
14940 13616 return 0;
14941 }
14942
14943
14944 static int
14945 6808 config_get_codec_name(wchar_t **config_encoding)
14946 {
14947 char *encoding;
14948
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 6808 times.
6808 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
14949 return -1;
14950 }
14951
14952 6808 PyObject *name_obj = NULL;
14953 6808 PyObject *codec = _PyCodec_Lookup(encoding);
14954 6808 PyMem_RawFree(encoding);
14955
14956
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6808 times.
6808 if (!codec)
14957 goto error;
14958
14959 6808 name_obj = PyObject_GetAttrString(codec, "name");
14960
1/2
✓ Branch 0 taken 6808 times.
✗ Branch 1 not taken.
6808 Py_CLEAR(codec);
14961
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6808 times.
6808 if (!name_obj) {
14962 goto error;
14963 }
14964
14965 6808 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14966 6808 Py_DECREF(name_obj);
14967
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6808 times.
6808 if (wname == NULL) {
14968 goto error;
14969 }
14970
14971 6808 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14972
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6808 times.
6808 if (raw_wname == NULL) {
14973 PyMem_Free(wname);
14974 PyErr_NoMemory();
14975 goto error;
14976 }
14977
14978 6808 PyMem_RawFree(*config_encoding);
14979 6808 *config_encoding = raw_wname;
14980
14981 6808 PyMem_Free(wname);
14982 6808 return 0;
14983
14984 error:
14985 Py_XDECREF(codec);
14986 Py_XDECREF(name_obj);
14987 return -1;
14988 }
14989
14990
14991 static PyStatus
14992 3404 init_stdio_encoding(PyInterpreterState *interp)
14993 {
14994 /* Update the stdio encoding to the normalized Python codec name. */
14995 3404 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14996
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3404 times.
3404 if (config_get_codec_name(&config->stdio_encoding) < 0) {
14997 return _PyStatus_ERR("failed to get the Python codec name "
14998 "of the stdio encoding");
14999 }
15000 3404 return _PyStatus_OK();
15001 }
15002
15003
15004 static int
15005 3404 init_fs_codec(PyInterpreterState *interp)
15006 {
15007 3404 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
15008
15009 _Py_error_handler error_handler;
15010 3404 error_handler = get_error_handler_wide(config->filesystem_errors);
15011
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3404 times.
3404 if (error_handler == _Py_ERROR_UNKNOWN) {
15012 PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
15013 return -1;
15014 }
15015
15016 char *encoding, *errors;
15017
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3404 times.
3404 if (encode_wstr_utf8(config->filesystem_encoding,
15018 &encoding,
15019 "filesystem_encoding") < 0) {
15020 return -1;
15021 }
15022
15023
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3404 times.
3404 if (encode_wstr_utf8(config->filesystem_errors,
15024 &errors,
15025 "filesystem_errors") < 0) {
15026 PyMem_RawFree(encoding);
15027 return -1;
15028 }
15029
15030 3404 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
15031 3404 PyMem_RawFree(fs_codec->encoding);
15032 3404 fs_codec->encoding = encoding;
15033 /* encoding has been normalized by init_fs_encoding() */
15034 3404 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
15035 3404 PyMem_RawFree(fs_codec->errors);
15036 3404 fs_codec->errors = errors;
15037 3404 fs_codec->error_handler = error_handler;
15038
15039 #ifdef _Py_FORCE_UTF8_FS_ENCODING
15040 assert(fs_codec->utf8 == 1);
15041 #endif
15042
15043 /* At this point, PyUnicode_EncodeFSDefault() and
15044 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15045 the C implementation of the filesystem encoding. */
15046
15047 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15048 global configuration variables. */
15049
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3404 times.
3404 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
15050 3404 fs_codec->errors) < 0) {
15051 PyErr_NoMemory();
15052 return -1;
15053 }
15054 3404 return 0;
15055 }
15056
15057
15058 static PyStatus
15059 3404 init_fs_encoding(PyThreadState *tstate)
15060 {
15061 3404 PyInterpreterState *interp = tstate->interp;
15062
15063 /* Update the filesystem encoding to the normalized Python codec name.
15064 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15065 (Python codec name). */
15066 3404 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
15067
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3404 times.
3404 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
15068 _Py_DumpPathConfig(tstate);
15069 return _PyStatus_ERR("failed to get the Python codec "
15070 "of the filesystem encoding");
15071 }
15072
15073
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3404 times.
3404 if (init_fs_codec(interp) < 0) {
15074 return _PyStatus_ERR("cannot initialize filesystem codec");
15075 }
15076 3404 return _PyStatus_OK();
15077 }
15078
15079
15080 PyStatus
15081 3404 _PyUnicode_InitEncodings(PyThreadState *tstate)
15082 {
15083 3404 PyStatus status = init_fs_encoding(tstate);
15084
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3404 times.
3404 if (_PyStatus_EXCEPTION(status)) {
15085 return status;
15086 }
15087
15088 3404 return init_stdio_encoding(tstate->interp);
15089 }
15090
15091
15092 static void
15093 3404 _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
15094 {
15095 3404 PyMem_RawFree(fs_codec->encoding);
15096 3404 fs_codec->encoding = NULL;
15097 3404 fs_codec->utf8 = 0;
15098 3404 PyMem_RawFree(fs_codec->errors);
15099 3404 fs_codec->errors = NULL;
15100 3404 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
15101 3404 }
15102
15103
15104 #ifdef MS_WINDOWS
15105 int
15106 _PyUnicode_EnableLegacyWindowsFSEncoding(void)
15107 {
15108 PyInterpreterState *interp = _PyInterpreterState_GET();
15109 PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);
15110
15111 /* Set the filesystem encoding to mbcs/replace (PEP 529) */
15112 wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
15113 wchar_t *errors = _PyMem_RawWcsdup(L"replace");
15114 if (encoding == NULL || errors == NULL) {
15115 PyMem_RawFree(encoding);
15116 PyMem_RawFree(errors);
15117 PyErr_NoMemory();
15118 return -1;
15119 }
15120
15121 PyMem_RawFree(config->filesystem_encoding);
15122 config->filesystem_encoding = encoding;
15123 PyMem_RawFree(config->filesystem_errors);
15124 config->filesystem_errors = errors;
15125
15126 return init_fs_codec(interp);
15127 }
15128 #endif
15129
15130
15131 #ifdef Py_DEBUG
15132 static inline int
15133 unicode_is_finalizing(void)
15134 {
15135 return (interned == NULL);
15136 }
15137 #endif
15138
15139
15140 void
15141 3404 _PyUnicode_FiniTypes(PyInterpreterState *interp)
15142 {
15143
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3404 times.
3404 if (!_Py_IsMainInterpreter(interp)) {
15144 return;
15145 }
15146
15147 3404 _PyStaticType_Dealloc(&EncodingMapType);
15148 3404 _PyStaticType_Dealloc(&PyFieldNameIter_Type);
15149 3404 _PyStaticType_Dealloc(&PyFormatterIter_Type);
15150 }
15151
15152
15153 9626628 static void unicode_static_dealloc(PyObject *op)
15154 {
15155 9626628 PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
15156
15157 assert(ascii->state.compact);
15158
15159
2/2
✓ Branch 0 taken 456010 times.
✓ Branch 1 taken 9170618 times.
9626628 if (!ascii->state.ascii) {
15160 456010 PyCompactUnicodeObject* compact = (PyCompactUnicodeObject*)op;
15161
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 456010 times.
456010 if (compact->utf8) {
15162 PyObject_Free(compact->utf8);
15163 compact->utf8 = NULL;
15164 compact->utf8_length = 0;
15165 }
15166 }
15167 9626628 }
15168
15169
15170 void
15171 3404 _PyUnicode_Fini(PyInterpreterState *interp)
15172 {
15173 3404 struct _Py_unicode_state *state = &interp->unicode;
15174
15175
1/2
✓ Branch 1 taken 3404 times.
✗ Branch 2 not taken.
3404 if (_Py_IsMainInterpreter(interp)) {
15176 // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
15177 assert(interned == NULL);
15178 // bpo-47182: force a unicodedata CAPI capsule re-import on
15179 // subsequent initialization of main interpreter.
15180 3404 ucnhash_capi = NULL;
15181 }
15182
15183 3404 _PyUnicode_FiniEncodings(&state->fs_codec);
15184
15185 3404 unicode_clear_identifiers(state);
15186
15187 // Clear the single character singletons
15188
2/2
✓ Branch 0 taken 435712 times.
✓ Branch 1 taken 3404 times.
439116 for (int i = 0; i < 128; i++) {
15189 435712 unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).ascii[i]);
15190 }
15191
2/2
✓ Branch 0 taken 435712 times.
✓ Branch 1 taken 3404 times.
439116 for (int i = 0; i < 128; i++) {
15192 435712 unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).latin1[i]);
15193 }
15194 3404 }
15195
15196
15197 void
15198 8755204 _PyStaticUnicode_Dealloc(PyObject *op)
15199 {
15200 8755204 unicode_static_dealloc(op);
15201 8755204 }
15202
15203
15204 /* A _string module, to export formatter_parser and formatter_field_name_split
15205 to the string.Formatter class implemented in Python. */
15206
15207 static PyMethodDef _string_methods[] = {
15208 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15209 METH_O, PyDoc_STR("split the argument as a field name")},
15210 {"formatter_parser", (PyCFunction) formatter_parser,
15211 METH_O, PyDoc_STR("parse the argument as a format string")},
15212 {NULL, NULL}
15213 };
15214
15215 static struct PyModuleDef _string_module = {
15216 PyModuleDef_HEAD_INIT,
15217 .m_name = "_string",
15218 .m_doc = PyDoc_STR("string helper module"),
15219 .m_size = 0,
15220 .m_methods = _string_methods,
15221 };
15222
15223 PyMODINIT_FUNC
15224 748 PyInit__string(void)
15225 {
15226 748 return PyModuleDef_Init(&_string_module);
15227 }
15228
15229
15230 #ifdef __cplusplus
15231 }
15232 #endif
15233